/* armv8-curve25519
 *
 * Copyright (C) 2006-2020 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */

/* Generated using (from wolfssl):
 *   cd ../scripts
 *   ruby ./x25519/x25519.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S
 */
#ifdef WOLFSSL_ARMASM
#ifdef __aarch64__
	.text
	.align	2
	.globl	fe_init
	.type	fe_init, %function
/* fe_init: empty entry point.
 *
 * The body is a single ret: no register other than the program counter is
 * written and no memory is accessed.
 */
fe_init:
	ret
	.size	fe_init,.-fe_init
	.text
	.align	2
	.globl	fe_frombytes
	.type	fe_frombytes, %function
/* fe_frombytes: copy 32 bytes from [x1] to [x0] with bit 255 cleared.
 *
 * In:       x0 = destination (32 bytes), x1 = source (32 bytes)
 * Scratch:  x2-x5
 * Both loads complete before the first store.
 */
fe_frombytes:
	# Upper 16 bytes go to x2/x3, lower 16 bytes to x4/x5
	ldp	x2, x3, [x1, #16]
	ldp	x4, x5, [x1]
	# Keep only the low 63 bits of the most significant word
	and	x3, x3, #0x7fffffffffffffff
	stp	x2, x3, [x0, #16]
	stp	x4, x5, [x0]
	ret
	.size	fe_frombytes,.-fe_frombytes
	.text
	.align	2
	.globl	fe_tobytes
	.type	fe_tobytes, %function
/* fe_tobytes: write the four limbs at [x1] to [x0] after one conditional
 * subtraction of 2^255 - 19.
 *
 * In:       x0 = destination (32 bytes), x1 = source (4 x 64-bit limbs)
 * Scratch:  x2-x7, flags
 *
 * The subtraction is done as "add 19, then drop bit 255" and is applied
 * when bit 255 of (value + 19) is set.
 */
fe_tobytes:
	ldp	x2, x3, [x1]
	ldp	x4, x5, [x1, #16]
	mov	x6, #19
	# Trial addition of 19: only the carry into the top limb is kept
	cmn	x2, x6
	adcs	x7, x3, xzr
	adcs	x7, x4, xzr
	adc	x7, x5, xzr
	# Turn bit 255 of the trial sum into either 19 or 0
	and	x7, x6, x7, asr #63
	# Apply the correction and discard bit 255
	adds	x2, x2, x7
	adcs	x3, x3, xzr
	adcs	x4, x4, xzr
	adc	x5, x5, xzr
	and	x5, x5, #0x7fffffffffffffff
	stp	x2, x3, [x0]
	stp	x4, x5, [x0, #16]
	ret
	.size	fe_tobytes,.-fe_tobytes
	.text
	.align	2
	.globl	fe_1
	.type	fe_1, %function
/* fe_1: store the four-limb value 1 at [x0].
 *
 * In:       x0 = destination (4 x 64-bit limbs)
 * Scratch:  x1
 */
fe_1:
	# Upper two limbs are zero
	stp	xzr, xzr, [x0, #16]
	# Lowest limb is one, the next limb is zero
	mov	x1, #1
	stp	x1, xzr, [x0]
	ret
	.size	fe_1,.-fe_1
	.text
	.align	2
	.globl	fe_0
	.type	fe_0, %function
/* fe_0: store four zero limbs (32 bytes) at [x0].
 *
 * In:  x0 = destination (4 x 64-bit limbs)
 * No scratch registers are used.
 */
fe_0:
	# Clear the upper half, then the lower half
	stp	xzr, xzr, [x0, #16]
	stp	xzr, xzr, [x0]
	ret
	.size	fe_0,.-fe_0
	.text
	.align	2
	.globl	fe_copy
	.type	fe_copy, %function
/* fe_copy: copy four 64-bit limbs from [x1] to [x0].
 *
 * In:       x0 = destination, x1 = source
 * Scratch:  x2-x5
 * Both loads complete before the first store.
 */
fe_copy:
	# Read all 32 bytes
	ldp	x4, x5, [x1, #16]
	ldp	x2, x3, [x1]
	# Write all 32 bytes
	stp	x4, x5, [x0, #16]
	stp	x2, x3, [x0]
	ret
	.size	fe_copy,.-fe_copy
	.text
	.align	2
	.globl	fe_sub
	.type	fe_sub, %function
/* fe_sub: r = a - b over four 64-bit limbs, adding 2^255 - 19 back once
 * when the 256-bit subtraction borrows.
 *
 * In:       x0 = r, x1 = a, x2 = b (each 4 x 64-bit limbs)
 * Scratch:  x3-x13, flags
 * No branches are taken; the correction is applied through a mask.
 */
fe_sub:
	ldp	x3, x4, [x1]
	ldp	x7, x8, [x2]
	ldp	x5, x6, [x1, #16]
	ldp	x9, x10, [x2, #16]
	# Lowest limb of the modulus (mov leaves the flags alone)
	mov	x13, #-19
	# Limb-wise a - b; carry clear at the end signals a borrow
	subs	x3, x3, x7
	sbcs	x4, x4, x8
	sbcs	x5, x5, x9
	sbcs	x6, x6, x10
	# All ones on borrow, zero otherwise
	csetm	x11, lo
	# Masked modulus limbs: x13 lowest, x11 both middle limbs, x12 top
	and	x13, x13, x11
	lsr	x12, x11, #1
	# Add the masked modulus
	adds	x3, x3, x13
	adcs	x4, x4, x11
	adcs	x5, x5, x11
	adc	x6, x6, x12
	stp	x3, x4, [x0]
	stp	x5, x6, [x0, #16]
	ret
	.size	fe_sub,.-fe_sub
	.text
	.align	2
	.globl	fe_add
	.type	fe_add, %function
/* fe_add: r = a + b over four 64-bit limbs, subtracting 2^255 - 19 once
 * when bit 255 of the sum is set.
 *
 * In:       x0 = r, x1 = a, x2 = b (each 4 x 64-bit limbs)
 * Scratch:  x3-x13, flags
 * No branches are taken; the correction is applied through a mask.
 */
fe_add:
	ldp	x3, x4, [x1]
	ldp	x7, x8, [x2]
	ldp	x5, x6, [x1, #16]
	ldp	x9, x10, [x2, #16]
	# Lowest limb of the modulus
	mov	x13, #-19
	# Limb-wise a + b
	adds	x3, x3, x7
	adcs	x4, x4, x8
	adcs	x5, x5, x9
	adc	x6, x6, x10
	# All ones when the top bit of the sum is set, zero otherwise
	asr	x11, x6, #63
	# Masked modulus limbs: x13 lowest, x11 both middle limbs, x12 top
	and	x13, x13, x11
	lsr	x12, x11, #1
	# Subtract the masked modulus
	subs	x3, x3, x13
	sbcs	x4, x4, x11
	sbcs	x5, x5, x11
	sbc	x6, x6, x12
	stp	x3, x4, [x0]
	stp	x5, x6, [x0, #16]
	ret
	.size	fe_add,.-fe_add
	.text
	.align	2
	.globl	fe_neg
	.type	fe_neg, %function
/* fe_neg: r = (2^255 - 19) - a over four 64-bit limbs.
 *
 * In:       x0 = r, x1 = a (each 4 x 64-bit limbs)
 * Scratch:  x2-x9, flags
 * No further reduction is applied to the difference.
 */
fe_neg:
	ldp	x2, x3, [x1]
	ldp	x4, x5, [x1, #16]
	# Build the modulus limbs in x6 (low), x7, x8 and x9 (top)
	mov	x7, #-1
	mov	x8, x7
	lsr	x9, x7, #1
	mov	x6, #-19
	# Modulus minus input with borrow propagation
	subs	x6, x6, x2
	sbcs	x7, x7, x3
	sbcs	x8, x8, x4
	sbc	x9, x9, x5
	stp	x6, x7, [x0]
	stp	x8, x9, [x0, #16]
	ret
	.size	fe_neg,.-fe_neg
	.text
	.align	2
	.globl	fe_isnonzero
	.type	fe_isnonzero, %function
/* fe_isnonzero: report whether the four-limb value at [x0] is non-zero
 * after one conditional subtraction of 2^255 - 19.
 *
 * In:       x0 = a (4 x 64-bit limbs)
 * Out:      x0 = 1 when the reduced value is non-zero, 0 otherwise
 * Scratch:  x1-x6, flags
 *
 * The limbs are OR-ed together and the 64-bit result is collapsed to 0/1.
 * Returning the raw OR would leave w0 equal to zero whenever every set bit
 * sits in the upper 32 bits of the limbs.
 * NOTE(review): this matters if C callers declare the return type as a
 * 32-bit int and therefore read only w0 - confirm against the prototype.
 * The collapse uses cmp/cset, so the routine stays free of branches.
 */
fe_isnonzero:
	mov	x6, #19
	ldp	x1, x2, [x0]
	ldp	x3, x4, [x0, #16]
	# Trial addition of 19: only the top limb of the sum is kept
	adds	x5, x1, x6
	adcs	x5, x2, xzr
	adcs	x5, x3, xzr
	adc	x5, x4, xzr
	# Select 19 when bit 255 of the trial sum is set, otherwise 0
	and	x5, x6, x5, asr 63
	# Apply the correction and discard bit 255
	adds	x1, x1, x5
	adcs	x2, x2, xzr
	adcs	x3, x3, xzr
	adc	x4, x4, xzr
	and	x4, x4, #0x7fffffffffffffff
	# Combine all limbs
	orr	x0, x1, x2
	orr	x3, x3, x4
	orr	x0, x0, x3
	# Collapse to 0 or 1 so the answer is visible in the low 32 bits
	cmp	x0, #0
	cset	x0, ne
	ret
	.size	fe_isnonzero,.-fe_isnonzero
	.text
	.align	2
	.globl	fe_isnegative
	.type	fe_isnegative, %function
/* fe_isnegative: return the low bit the value at [x0] would have after one
 * conditional subtraction of 2^255 - 19.
 *
 * In:       x0 = a (4 x 64-bit limbs)
 * Out:      x0 = 0 or 1
 * Scratch:  x1-x6, flags
 *
 * Bit 255 of (a + 19) says whether the subtraction would happen; since it
 * amounts to adding the odd number 19, it flips the low bit.
 */
fe_isnegative:
	ldp	x1, x2, [x0]
	ldp	x3, x4, [x0, #16]
	mov	x6, #19
	# Trial addition of 19: only the top limb of the sum is kept
	cmn	x1, x6
	adcs	x5, x2, xzr
	adcs	x5, x3, xzr
	adc	x5, x4, xzr
	# Flip the low bit of limb 0 when bit 255 of the trial sum is set
	eor	x0, x1, x5, lsr #63
	and	x0, x0, #1
	ret
	.size	fe_isnegative,.-fe_isnegative
	.text
	.align	2
	.globl	fe_cmov_table
	.type	fe_cmov_table, %function
/* fe_cmov_table: select one 96-byte table entry by a signed 8-bit index
 * using conditional selects only.
 *
 * In:   x0 = output, 96 bytes written as three groups of 4 x 64-bit limbs
 *       x1 = table base; eight 96-byte entries are read (offsets 0..767)
 *       w2 = selector b; only its low byte is used, as a signed value
 *
 * The result starts as (first group = 1, second group = 1, third group = 0)
 * and is replaced by entry k-1 when |b| == k for k in 1..8. When b is
 * negative the first and second groups are exchanged and the third group
 * is replaced by (2^255 - 19) minus itself.
 *
 * All eight entries are loaded and every csel executes whatever b is; the
 * routine contains no branches and no selector-dependent addresses.
 *
 * Saves and restores x17 and x19-x28. Overwrites x2-x16 and the flags,
 * advances x1 by 0x180, and leaves the output pointer in x0.
 */
fe_cmov_table:
	stp	x29, x30, [sp, #-128]!
	add	x29, sp, #0
	str	x17, [x29, #40]
	str	x19, [x29, #48]
	stp	x20, x21, [x29, #56]
	stp	x22, x23, [x29, #72]
	stp	x24, x25, [x29, #88]
	stp	x26, x27, [x29, #104]
	str	x28, [x29, #120]
	# Keep the output pointer in the frame; x0 is reused for |b|
	str	x0, [x29, #16]
	# Sign-extend b, build its sign mask in x3, then x0 = (b ^ mask) - mask
	sxtb	x2, w2
	sbfx	x3, x2, #7, #1
	eor	x0, x2, x3
	sub	x0, x0, x3
	# Default result: x4-x7 = 1, x8-x11 = 1, x12-x15 = 0
	mov	x4, #1
	mov	x5, xzr
	mov	x6, xzr
	mov	x7, xzr
	mov	x8, #1
	mov	x9, xzr
	mov	x10, xzr
	mov	x11, xzr
	mov	x12, xzr
	mov	x13, xzr
	mov	x14, xzr
	mov	x15, xzr
	# Entry at offset 0 is taken when |b| == 1 (the loads leave the flags
	# from cmp untouched)
	cmp	x0, #1
	ldp	x16, x17, [x1]
	ldp	x19, x20, [x1, #16]
	ldp	x21, x22, [x1, #32]
	ldp	x23, x24, [x1, #48]
	ldp	x25, x26, [x1, #64]
	ldp	x27, x28, [x1, #80]
	csel	x4, x16, x4, eq
	csel	x5, x17, x5, eq
	csel	x6, x19, x6, eq
	csel	x7, x20, x7, eq
	csel	x8, x21, x8, eq
	csel	x9, x22, x9, eq
	csel	x10, x23, x10, eq
	csel	x11, x24, x11, eq
	csel	x12, x25, x12, eq
	csel	x13, x26, x13, eq
	csel	x14, x27, x14, eq
	csel	x15, x28, x15, eq
	# Entry at offset 96 is taken when |b| == 2
	cmp	x0, #2
	ldp	x16, x17, [x1, #96]
	ldp	x19, x20, [x1, #112]
	ldp	x21, x22, [x1, #128]
	ldp	x23, x24, [x1, #144]
	ldp	x25, x26, [x1, #160]
	ldp	x27, x28, [x1, #176]
	csel	x4, x16, x4, eq
	csel	x5, x17, x5, eq
	csel	x6, x19, x6, eq
	csel	x7, x20, x7, eq
	csel	x8, x21, x8, eq
	csel	x9, x22, x9, eq
	csel	x10, x23, x10, eq
	csel	x11, x24, x11, eq
	csel	x12, x25, x12, eq
	csel	x13, x26, x13, eq
	csel	x14, x27, x14, eq
	csel	x15, x28, x15, eq
	# Entry at offset 192 is taken when |b| == 3
	cmp	x0, #3
	ldp	x16, x17, [x1, #192]
	ldp	x19, x20, [x1, #208]
	ldp	x21, x22, [x1, #224]
	ldp	x23, x24, [x1, #240]
	ldp	x25, x26, [x1, #256]
	ldp	x27, x28, [x1, #272]
	csel	x4, x16, x4, eq
	csel	x5, x17, x5, eq
	csel	x6, x19, x6, eq
	csel	x7, x20, x7, eq
	csel	x8, x21, x8, eq
	csel	x9, x22, x9, eq
	csel	x10, x23, x10, eq
	csel	x11, x24, x11, eq
	csel	x12, x25, x12, eq
	csel	x13, x26, x13, eq
	csel	x14, x27, x14, eq
	csel	x15, x28, x15, eq
	# Entry at offset 288 is taken when |b| == 4
	cmp	x0, #4
	ldp	x16, x17, [x1, #288]
	ldp	x19, x20, [x1, #304]
	ldp	x21, x22, [x1, #320]
	ldp	x23, x24, [x1, #336]
	ldp	x25, x26, [x1, #352]
	ldp	x27, x28, [x1, #368]
	csel	x4, x16, x4, eq
	csel	x5, x17, x5, eq
	csel	x6, x19, x6, eq
	csel	x7, x20, x7, eq
	csel	x8, x21, x8, eq
	csel	x9, x22, x9, eq
	csel	x10, x23, x10, eq
	csel	x11, x24, x11, eq
	csel	x12, x25, x12, eq
	csel	x13, x26, x13, eq
	csel	x14, x27, x14, eq
	csel	x15, x28, x15, eq
	# Advance the base by four entries (0x180 = 4 * 96 bytes)
	add	x1, x1, #0x180
	# Fifth entry (new offset 0) is taken when |b| == 5
	cmp	x0, #5
	ldp	x16, x17, [x1]
	ldp	x19, x20, [x1, #16]
	ldp	x21, x22, [x1, #32]
	ldp	x23, x24, [x1, #48]
	ldp	x25, x26, [x1, #64]
	ldp	x27, x28, [x1, #80]
	csel	x4, x16, x4, eq
	csel	x5, x17, x5, eq
	csel	x6, x19, x6, eq
	csel	x7, x20, x7, eq
	csel	x8, x21, x8, eq
	csel	x9, x22, x9, eq
	csel	x10, x23, x10, eq
	csel	x11, x24, x11, eq
	csel	x12, x25, x12, eq
	csel	x13, x26, x13, eq
	csel	x14, x27, x14, eq
	csel	x15, x28, x15, eq
	# Sixth entry is taken when |b| == 6
	cmp	x0, #6
	ldp	x16, x17, [x1, #96]
	ldp	x19, x20, [x1, #112]
	ldp	x21, x22, [x1, #128]
	ldp	x23, x24, [x1, #144]
	ldp	x25, x26, [x1, #160]
	ldp	x27, x28, [x1, #176]
	csel	x4, x16, x4, eq
	csel	x5, x17, x5, eq
	csel	x6, x19, x6, eq
	csel	x7, x20, x7, eq
	csel	x8, x21, x8, eq
	csel	x9, x22, x9, eq
	csel	x10, x23, x10, eq
	csel	x11, x24, x11, eq
	csel	x12, x25, x12, eq
	csel	x13, x26, x13, eq
	csel	x14, x27, x14, eq
	csel	x15, x28, x15, eq
	# Seventh entry is taken when |b| == 7
	cmp	x0, #7
	ldp	x16, x17, [x1, #192]
	ldp	x19, x20, [x1, #208]
	ldp	x21, x22, [x1, #224]
	ldp	x23, x24, [x1, #240]
	ldp	x25, x26, [x1, #256]
	ldp	x27, x28, [x1, #272]
	csel	x4, x16, x4, eq
	csel	x5, x17, x5, eq
	csel	x6, x19, x6, eq
	csel	x7, x20, x7, eq
	csel	x8, x21, x8, eq
	csel	x9, x22, x9, eq
	csel	x10, x23, x10, eq
	csel	x11, x24, x11, eq
	csel	x12, x25, x12, eq
	csel	x13, x26, x13, eq
	csel	x14, x27, x14, eq
	csel	x15, x28, x15, eq
	# Eighth entry is taken when |b| == 8
	cmp	x0, #8
	ldp	x16, x17, [x1, #288]
	ldp	x19, x20, [x1, #304]
	ldp	x21, x22, [x1, #320]
	ldp	x23, x24, [x1, #336]
	ldp	x25, x26, [x1, #352]
	ldp	x27, x28, [x1, #368]
	csel	x4, x16, x4, eq
	csel	x5, x17, x5, eq
	csel	x6, x19, x6, eq
	csel	x7, x20, x7, eq
	csel	x8, x21, x8, eq
	csel	x9, x22, x9, eq
	csel	x10, x23, x10, eq
	csel	x11, x24, x11, eq
	csel	x12, x25, x12, eq
	csel	x13, x26, x13, eq
	csel	x14, x27, x14, eq
	csel	x15, x28, x15, eq
	# Compute (2^255 - 19) minus the third group into x16, x17, x19, x20
	mov	x16, #-19
	mov	x17, #-1
	mov	x19, #-1
	mov	x20, #0x7fffffffffffffff
	subs	x16, x16, x12
	sbcs	x17, x17, x13
	sbcs	x19, x19, x14
	sbc	x20, x20, x15
	# For negative b: exchange the first two groups limb by limb (x3 is the
	# temporary) and take the negated third group
	cmp	x2, #0
	mov	x3, x4
	csel	x4, x8, x4, lt
	csel	x8, x3, x8, lt
	mov	x3, x5
	csel	x5, x9, x5, lt
	csel	x9, x3, x9, lt
	mov	x3, x6
	csel	x6, x10, x6, lt
	csel	x10, x3, x10, lt
	mov	x3, x7
	csel	x7, x11, x7, lt
	csel	x11, x3, x11, lt
	csel	x12, x16, x12, lt
	csel	x13, x17, x13, lt
	csel	x14, x19, x14, lt
	csel	x15, x20, x15, lt
	# Reload the output pointer and write the three groups
	ldr	x0, [x29, #16]
	stp	x4, x5, [x0]
	stp	x6, x7, [x0, #16]
	stp	x8, x9, [x0, #32]
	stp	x10, x11, [x0, #48]
	stp	x12, x13, [x0, #64]
	stp	x14, x15, [x0, #80]
	ldr	x17, [x29, #40]
	ldr	x19, [x29, #48]
	ldp	x20, x21, [x29, #56]
	ldp	x22, x23, [x29, #72]
	ldp	x24, x25, [x29, #88]
	ldp	x26, x27, [x29, #104]
	ldr	x28, [x29, #120]
	ldp	x29, x30, [sp], #0x80
	ret
	.size	fe_cmov_table,.-fe_cmov_table
	.text
	.align	2
	.globl	fe_mul
	.type	fe_mul, %function
/* fe_mul: r = a * b, folded modulo 2^255 - 19.
 *
 * In:   x0 = r, x1 = a, x2 = b; each is 4 x 64-bit limbs with limb 0
 *       (offset 0) least significant
 *
 * A 4x4 schoolbook multiply builds the 512-bit product in x6-x13. The part
 * above bit 254 is multiplied by 19 and added to the low 255 bits, and the
 * small overflow of that sum is folded the same way. A final step folds
 * bit 255 once more as 19.
 * NOTE(review): there is no final compare against 2^255 - 19, so the
 * stored value may not be the least residue - confirm callers accept that.
 *
 * x0, x1 and x2 are never written; fe_invert in this file calls fe_mul
 * without reloading operands that did not change. Saves and restores x17
 * and x19-x22. Overwrites x3-x16 and the flags.
 */
fe_mul:
	stp	x29, x30, [sp, #-64]!
	add	x29, sp, #0
	str	x17, [x29, #24]
	str	x19, [x29, #32]
	stp	x20, x21, [x29, #40]
	str	x22, [x29, #56]
	# Multiply
	# A is held in x14-x17 and B in x19-x22; x3/x4 take each partial
	# product and x6-x13 accumulate the result columns
	ldp	x14, x15, [x1]
	ldp	x16, x17, [x1, #16]
	ldp	x19, x20, [x2]
	ldp	x21, x22, [x2, #16]
	#  A[0] * B[0]
	mul	x6, x14, x19
	umulh	x7, x14, x19
	#  A[0] * B[1]
	mul	x3, x14, x20
	umulh	x8, x14, x20
	adds	x7, x7, x3
	adc	x8, x8, xzr
	#  A[1] * B[0]
	mul	x3, x15, x19
	umulh	x4, x15, x19
	adds	x7, x7, x3
	adcs	x8, x8, x4
	adc	x9, xzr, xzr
	#  A[0] * B[2]
	mul	x3, x14, x21
	umulh	x4, x14, x21
	adds	x8, x8, x3
	adc	x9, x9, x4
	#  A[1] * B[1]
	mul	x3, x15, x20
	umulh	x4, x15, x20
	adds	x8, x8, x3
	adcs	x9, x9, x4
	adc	x10, xzr, xzr
	#  A[2] * B[0]
	mul	x3, x16, x19
	umulh	x4, x16, x19
	adds	x8, x8, x3
	adcs	x9, x9, x4
	adc	x10, x10, xzr
	#  A[0] * B[3]
	mul	x3, x14, x22
	umulh	x4, x14, x22
	adds	x9, x9, x3
	adcs	x10, x10, x4
	adc	x11, xzr, xzr
	#  A[1] * B[2]
	mul	x3, x15, x21
	umulh	x4, x15, x21
	adds	x9, x9, x3
	adcs	x10, x10, x4
	adc	x11, x11, xzr
	#  A[2] * B[1]
	mul	x3, x16, x20
	umulh	x4, x16, x20
	adds	x9, x9, x3
	adcs	x10, x10, x4
	adc	x11, x11, xzr
	#  A[3] * B[0]
	mul	x3, x17, x19
	umulh	x4, x17, x19
	adds	x9, x9, x3
	adcs	x10, x10, x4
	adc	x11, x11, xzr
	#  A[1] * B[3]
	mul	x3, x15, x22
	umulh	x4, x15, x22
	adds	x10, x10, x3
	adcs	x11, x11, x4
	adc	x12, xzr, xzr
	#  A[2] * B[2]
	mul	x3, x16, x21
	umulh	x4, x16, x21
	adds	x10, x10, x3
	adcs	x11, x11, x4
	adc	x12, x12, xzr
	#  A[3] * B[1]
	mul	x3, x17, x20
	umulh	x4, x17, x20
	adds	x10, x10, x3
	adcs	x11, x11, x4
	adc	x12, x12, xzr
	#  A[2] * B[3]
	mul	x3, x16, x22
	umulh	x4, x16, x22
	adds	x11, x11, x3
	adcs	x12, x12, x4
	adc	x13, xzr, xzr
	#  A[3] * B[2]
	mul	x3, x17, x21
	umulh	x4, x17, x21
	adds	x11, x11, x3
	adcs	x12, x12, x4
	adc	x13, x13, xzr
	#  A[3] * B[3]
	mul	x3, x17, x22
	umulh	x4, x17, x22
	adds	x12, x12, x3
	adc	x13, x13, x4
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	#  (the extr chain shifts product bits 255 and up down into x10-x13)
	extr	x13, x13, x12, #63
	extr	x12, x12, x11, #63
	extr	x11, x11, x10, #63
	extr	x10, x10, x9, #63
	and	x9, x9, #0x7fffffffffffffff
	#  Multiply top half by 19
	#  (low words are added here; the high words stay in x10-x12 and x5)
	mov	x3, #19
	mul	x4, x3, x10
	umulh	x10, x3, x10
	adds	x6, x6, x4
	mul	x4, x3, x11
	umulh	x11, x3, x11
	adcs	x7, x7, x4
	mul	x4, x3, x12
	umulh	x12, x3, x12
	adcs	x8, x8, x4
	mul	x4, x3, x13
	umulh	x5, x3, x13
	adcs	x9, x9, x4
	adc	x5, x5, xzr
	#  Add remaining product results in
	adds	x7, x7, x10
	adcs	x8, x8, x11
	adcs	x9, x9, x12
	adc	x5, x5, xzr
	#  Overflow
	#  (x5 becomes everything above bit 254, times 19, added at limb 0)
	extr	x5, x5, x9, #63
	mul	x5, x5, x3
	and	x9, x9, #0x7fffffffffffffff
	adds	x6, x6, x5
	adcs	x7, x7, xzr
	adcs	x8, x8, xzr
	adc	x9, x9, xzr
	# Reduce if top bit set
	# (x5 is 19 when bit 255 is set, otherwise 0)
	and	x5, x3, x9, asr 63
	and	x9, x9, #0x7fffffffffffffff
	adds	x6, x6, x5
	adcs	x7, x7, xzr
	adcs	x8, x8, xzr
	adc	x9, x9, xzr
	# Store
	stp	x6, x7, [x0]
	stp	x8, x9, [x0, #16]
	ldr	x17, [x29, #24]
	ldr	x19, [x29, #32]
	ldp	x20, x21, [x29, #40]
	ldr	x22, [x29, #56]
	ldp	x29, x30, [sp], #0x40
	ret
	.size	fe_mul,.-fe_mul
	.text
	.align	2
	.globl	fe_sq
	.type	fe_sq, %function
/* fe_sq: r = a * a, folded modulo 2^255 - 19.
 *
 * In:   x0 = r, x1 = a; each is 4 x 64-bit limbs with limb 0 (offset 0)
 *       least significant
 *
 * The six cross products A[i] * A[j] with i < j are summed once and then
 * doubled; the four squares A[i] * A[i] are added afterwards. The 512-bit
 * result (x5 lowest, then x6-x12) is folded exactly as in fe_mul: the part
 * above bit 254 is multiplied by 19 and added to the low 255 bits.
 * NOTE(review): there is no final compare against 2^255 - 19, so the
 * stored value may not be the least residue - confirm callers accept that.
 *
 * Leaf routine without a stack frame. x0 and x1 are never written;
 * fe_invert in this file squares repeatedly without reloading them.
 * Overwrites x2-x16 and the flags.
 */
fe_sq:
	# Square
	ldp	x13, x14, [x1]
	ldp	x15, x16, [x1, #16]
	#  A[0] * A[1]
	mul	x6, x13, x14
	umulh	x7, x13, x14
	#  A[0] * A[2]
	mul	x2, x13, x15
	umulh	x8, x13, x15
	adds	x7, x7, x2
	adc	x8, x8, xzr
	#  A[0] * A[3]
	mul	x2, x13, x16
	umulh	x9, x13, x16
	adds	x8, x8, x2
	adc	x9, x9, xzr
	#  A[1] * A[2]
	mul	x2, x14, x15
	umulh	x3, x14, x15
	adds	x8, x8, x2
	adcs	x9, x9, x3
	adc	x10, xzr, xzr
	#  A[1] * A[3]
	mul	x2, x14, x16
	umulh	x3, x14, x16
	adds	x9, x9, x2
	adc	x10, x10, x3
	#  A[2] * A[3]
	mul	x2, x15, x16
	umulh	x11, x15, x16
	adds	x10, x10, x2
	adc	x11, x11, xzr
	# Double
	# (x6-x11 hold the cross-product sum; x12 receives the carry out)
	adds	x6, x6, x6
	adcs	x7, x7, x7
	adcs	x8, x8, x8
	adcs	x9, x9, x9
	adcs	x10, x10, x10
	adcs	x11, x11, x11
	adc	x12, xzr, xzr
	#  A[0] * A[0]
	mul	x5, x13, x13
	umulh	x4, x13, x13
	#  A[1] * A[1]
	mul	x2, x14, x14
	umulh	x3, x14, x14
	adds	x6, x6, x4
	adcs	x7, x7, x2
	adc	x4, x3, xzr
	#  A[2] * A[2]
	mul	x2, x15, x15
	umulh	x3, x15, x15
	adds	x8, x8, x4
	adcs	x9, x9, x2
	adc	x4, x3, xzr
	#  A[3] * A[3]
	mul	x2, x16, x16
	umulh	x3, x16, x16
	adds	x10, x10, x4
	adcs	x11, x11, x2
	adc	x12, x12, x3
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	#  (the extr chain shifts result bits 255 and up down into x9-x12)
	extr	x12, x12, x11, #63
	extr	x11, x11, x10, #63
	extr	x10, x10, x9, #63
	extr	x9, x9, x8, #63
	and	x8, x8, #0x7fffffffffffffff
	#  Multiply top half by 19
	#  (low words are added here; the high words stay in x9-x11 and x4)
	mov	x2, #19
	mul	x3, x2, x9
	umulh	x9, x2, x9
	adds	x5, x5, x3
	mul	x3, x2, x10
	umulh	x10, x2, x10
	adcs	x6, x6, x3
	mul	x3, x2, x11
	umulh	x11, x2, x11
	adcs	x7, x7, x3
	mul	x3, x2, x12
	umulh	x4, x2, x12
	adcs	x8, x8, x3
	adc	x4, x4, xzr
	#  Add remaining product results in
	adds	x6, x6, x9
	adcs	x7, x7, x10
	adcs	x8, x8, x11
	adc	x4, x4, xzr
	#  Overflow
	#  (x4 becomes everything above bit 254, times 19, added at limb 0)
	extr	x4, x4, x8, #63
	mul	x4, x4, x2
	and	x8, x8, #0x7fffffffffffffff
	adds	x5, x5, x4
	adcs	x6, x6, xzr
	adcs	x7, x7, xzr
	adc	x8, x8, xzr
	# Reduce if top bit set
	# (x4 is 19 when bit 255 is set, otherwise 0)
	and	x4, x2, x8, asr 63
	and	x8, x8, #0x7fffffffffffffff
	adds	x5, x5, x4
	adcs	x6, x6, xzr
	adcs	x7, x7, xzr
	adc	x8, x8, xzr
	# Store
	stp	x5, x6, [x0]
	stp	x7, x8, [x0, #16]
	ret
	.size	fe_sq,.-fe_sq
	.text
	.align	2
	.globl	fe_invert
	.type	fe_invert, %function
/* fe_invert: r = z^(2^255 - 21) using fe_sq and fe_mul, i.e. z raised to
 * (modulus - 2) for the modulus 2^255 - 19.
 *
 * In:   x0 = r, x1 = z (each 4 x 64-bit limbs)
 *
 * Frame (176 bytes, addressed from x29):
 *    16  T0      48  T1      80  T2     112  T3     (32 bytes each)
 *   144  saved r pointer     152  saved z pointer   168  saved x20
 *
 * x20 is the loop counter. The sequence depends on fe_sq leaving x0/x1 and
 * fe_mul leaving x0/x1/x2 unchanged: an argument register is set again
 * only when its value has to change. fe_sq does overwrite x2, so x2 is set
 * before every fe_mul. In the step comments, X^(2^n) stands for n
 * successive squarings of X.
 */
fe_invert:
	stp	x29, x30, [sp, #-176]!
	mov	x29, sp
	str	x20, [x29, #168]
	str	x0, [x29, #144]
	str	x1, [x29, #152]
	# T0 = z^2
	add	x0, x29, #16
	bl	fe_sq
	# T1 = T0^(2^2) = z^8
	add	x1, x29, #16
	add	x0, x29, #48
	bl	fe_sq
	add	x1, x29, #48
	bl	fe_sq
	# T1 = z * T1 = z^9
	add	x2, x29, #48
	ldr	x1, [x29, #152]
	bl	fe_mul
	# T0 = T0 * T1 = z^11
	add	x2, x29, #48
	add	x1, x29, #16
	add	x0, x29, #16
	bl	fe_mul
	# T2 = T0^2 = z^22
	add	x0, x29, #80
	bl	fe_sq
	# T1 = T1 * T2 = z^31 = z^(2^5 - 1)
	add	x2, x29, #80
	add	x1, x29, #48
	add	x0, x29, #48
	bl	fe_mul
	# T2 = T1^(2^5): one squaring from T1, then four in place
	add	x0, x29, #80
	bl	fe_sq
	add	x1, x29, #80
	mov	x20, #4
L_fe_invert1:
	bl	fe_sq
	subs	x20, x20, #1
	b.ne	L_fe_invert1
	# T1 = T2 * T1 = z^(2^10 - 1)
	add	x2, x29, #48
	add	x0, x29, #48
	bl	fe_mul
	# T2 = T1^(2^10): one squaring from T1, then nine in place
	add	x1, x29, #48
	add	x0, x29, #80
	bl	fe_sq
	add	x1, x29, #80
	mov	x20, #9
L_fe_invert2:
	bl	fe_sq
	subs	x20, x20, #1
	b.ne	L_fe_invert2
	# T2 = T2 * T1 = z^(2^20 - 1)
	add	x2, x29, #48
	bl	fe_mul
	# T3 = T2^(2^20): one squaring from T2, then nineteen in place
	add	x0, x29, #112
	bl	fe_sq
	add	x1, x29, #112
	mov	x20, #19
L_fe_invert3:
	bl	fe_sq
	subs	x20, x20, #1
	b.ne	L_fe_invert3
	# T2 = T3 * T2 = z^(2^40 - 1)
	add	x2, x29, #80
	add	x0, x29, #80
	bl	fe_mul
	# T2 = T2^(2^10): ten squarings in place
	add	x1, x29, #80
	mov	x20, #10
L_fe_invert4:
	bl	fe_sq
	subs	x20, x20, #1
	b.ne	L_fe_invert4
	# T1 = T2 * T1 = z^(2^50 - 1)
	add	x2, x29, #48
	add	x0, x29, #48
	bl	fe_mul
	# T2 = T1^(2^50): one squaring from T1, then forty-nine in place
	add	x1, x29, #48
	add	x0, x29, #80
	bl	fe_sq
	add	x1, x29, #80
	mov	x20, #49
L_fe_invert5:
	bl	fe_sq
	subs	x20, x20, #1
	b.ne	L_fe_invert5
	# T2 = T2 * T1 = z^(2^100 - 1)
	add	x2, x29, #48
	bl	fe_mul
	# T3 = T2^(2^100): one squaring from T2, then ninety-nine in place
	add	x0, x29, #112
	bl	fe_sq
	add	x1, x29, #112
	mov	x20, #99
L_fe_invert6:
	bl	fe_sq
	subs	x20, x20, #1
	b.ne	L_fe_invert6
	# T2 = T3 * T2 = z^(2^200 - 1)
	add	x2, x29, #80
	add	x0, x29, #80
	bl	fe_mul
	# T2 = T2^(2^50): fifty squarings in place
	add	x1, x29, #80
	mov	x20, #50
L_fe_invert7:
	bl	fe_sq
	subs	x20, x20, #1
	b.ne	L_fe_invert7
	# T1 = T2 * T1 = z^(2^250 - 1)
	add	x2, x29, #48
	add	x0, x29, #48
	bl	fe_mul
	# T1 = T1^(2^5) = z^(2^255 - 32): five squarings in place
	add	x1, x29, #48
	mov	x20, #5
L_fe_invert8:
	bl	fe_sq
	subs	x20, x20, #1
	b.ne	L_fe_invert8
	# Result = T1 * T0 = z^(2^255 - 32 + 11) = z^(2^255 - 21)
	add	x2, x29, #16
	ldr	x0, [x29, #144]
	bl	fe_mul
	ldr	x20, [x29, #168]
	ldp	x29, x30, [sp], #176
	ret
	.size	fe_invert,.-fe_invert
	.text
	.align	2
	.globl	curve25519
	.type	curve25519, %function
curve25519:
	stp	x29, x30, [sp, #-288]!
	add	x29, sp, #0
	str	x17, [x29, #200]
	str	x19, [x29, #208]
	stp	x20, x21, [x29, #216]
	stp	x22, x23, [x29, #232]
	stp	x24, x25, [x29, #248]
	stp	x26, x27, [x29, #264]
	str	x28, [x29, #280]
	mov	x23, xzr
	str	x0, [x29, #176]
	str	x2, [x29, #184]
	# Copy
	ldp	x6, x7, [x2]
	ldp	x8, x9, [x2, #16]
	stp	x6, x7, [x29, #80]
	stp	x8, x9, [x29, #96]
	# Set one
	mov	x2, #1
	stp	x2, xzr, [x0]
	stp	xzr, xzr, [x0, #16]
	# Set zero
	stp	xzr, xzr, [x29, #16]
	stp	xzr, xzr, [x29, #32]
	# Set one
	mov	x2, #1
	stp	x2, xzr, [x29, #48]
	stp	xzr, xzr, [x29, #64]
	mov	x25, #62
	mov	x24, #24
L_curve25519_words:
L_curve25519_bits:
	ldr	x2, [x1, x24]
	lsr	x2, x2, x25
	and	x2, x2, #1
	eor	x23, x23, x2
	# Conditional Swap
	cmp	x23, #1
	ldp	x10, x11, [x0]
	ldp	x12, x13, [x0, #16]
	ldp	x6, x7, [x29, #80]
	ldp	x8, x9, [x29, #96]
	csel	x14, x10, x6, eq
	csel	x10, x6, x10, eq
	csel	x15, x11, x7, eq
	csel	x11, x7, x11, eq
	csel	x16, x12, x8, eq
	csel	x12, x8, x12, eq
	csel	x17, x13, x9, eq
	csel	x13, x9, x13, eq
	# Conditional Swap
	cmp	x23, #1
	ldp	x19, x20, [x29, #16]
	ldp	x21, x22, [x29, #32]
	ldp	x6, x7, [x29, #48]
	ldp	x8, x9, [x29, #64]
	csel	x5, x19, x6, eq
	csel	x19, x6, x19, eq
	csel	x26, x20, x7, eq
	csel	x20, x7, x20, eq
	csel	x27, x21, x8, eq
	csel	x21, x8, x21, eq
	csel	x28, x22, x9, eq
	csel	x22, x9, x22, eq
	mov	x23, x2
	# Add
	adds	x6, x10, x19
	adcs	x7, x11, x20
	adcs	x8, x12, x21
	adc	x9, x13, x22
	mov	x3, #-19
	asr	x2, x9, #63
	#   Mask the modulus
	and	x3, x2, x3
	and	x4, x2, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x6, x6, x3
	sbcs	x7, x7, x2
	sbcs	x8, x8, x2
	sbc	x9, x9, x4
	# Sub
	subs	x19, x10, x19
	sbcs	x20, x11, x20
	sbcs	x21, x12, x21
	sbcs	x22, x13, x22
	mov	x3, #-19
	csetm	x2, cc
	#   Mask the modulus
	and	x3, x2, x3
	and	x4, x2, #0x7fffffffffffffff
	#   Add modulus (if underflow)
	adds	x19, x19, x3
	adcs	x20, x20, x2
	adcs	x21, x21, x2
	adc	x22, x22, x4
	stp	x19, x20, [x29, #144]
	stp	x21, x22, [x29, #160]
	# Add
	adds	x10, x14, x5
	adcs	x11, x15, x26
	adcs	x12, x16, x27
	adc	x13, x17, x28
	mov	x3, #-19
	asr	x2, x13, #63
	#   Mask the modulus
	and	x3, x2, x3
	and	x4, x2, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x10, x10, x3
	sbcs	x11, x11, x2
	sbcs	x12, x12, x2
	sbc	x13, x13, x4
	# Sub
	subs	x14, x14, x5
	sbcs	x15, x15, x26
	sbcs	x16, x16, x27
	sbcs	x17, x17, x28
	mov	x3, #-19
	csetm	x2, cc
	#   Mask the modulus
	and	x3, x2, x3
	and	x4, x2, #0x7fffffffffffffff
	#   Add modulus (if underflow)
	adds	x14, x14, x3
	adcs	x15, x15, x2
	adcs	x16, x16, x2
	adc	x17, x17, x4
	# Multiply
	#  A[0] * B[0]
	mul	x19, x14, x6
	umulh	x20, x14, x6
	#  A[0] * B[1]
	mul	x3, x14, x7
	umulh	x21, x14, x7
	adds	x20, x20, x3
	adc	x21, x21, xzr
	#  A[1] * B[0]
	mul	x3, x15, x6
	umulh	x4, x15, x6
	adds	x20, x20, x3
	adcs	x21, x21, x4
	adc	x22, xzr, xzr
	#  A[0] * B[2]
	mul	x3, x14, x8
	umulh	x4, x14, x8
	adds	x21, x21, x3
	adc	x22, x22, x4
	#  A[1] * B[1]
	mul	x3, x15, x7
	umulh	x4, x15, x7
	adds	x21, x21, x3
	adcs	x22, x22, x4
	adc	x2, xzr, xzr
	#  A[2] * B[0]
	mul	x3, x16, x6
	umulh	x4, x16, x6
	adds	x21, x21, x3
	adcs	x22, x22, x4
	adc	x2, x2, xzr
	#  A[0] * B[3]
	mul	x3, x14, x9
	umulh	x4, x14, x9
	adds	x22, x22, x3
	adcs	x2, x2, x4
	adc	x26, xzr, xzr
	#  A[1] * B[2]
	mul	x3, x15, x8
	umulh	x4, x15, x8
	adds	x22, x22, x3
	adcs	x2, x2, x4
	adc	x26, x26, xzr
	#  A[2] * B[1]
	mul	x3, x16, x7
	umulh	x4, x16, x7
	adds	x22, x22, x3
	adcs	x2, x2, x4
	adc	x26, x26, xzr
	#  A[3] * B[0]
	mul	x3, x17, x6
	umulh	x4, x17, x6
	adds	x22, x22, x3
	adcs	x2, x2, x4
	adc	x26, x26, xzr
	#  A[1] * B[3]
	mul	x3, x15, x9
	umulh	x4, x15, x9
	adds	x2, x2, x3
	adcs	x26, x26, x4
	adc	x27, xzr, xzr
	#  A[2] * B[2]
	mul	x3, x16, x8
	umulh	x4, x16, x8
	adds	x2, x2, x3
	adcs	x26, x26, x4
	adc	x27, x27, xzr
	#  A[3] * B[1]
	mul	x3, x17, x7
	umulh	x4, x17, x7
	adds	x2, x2, x3
	adcs	x26, x26, x4
	adc	x27, x27, xzr
	#  A[2] * B[3]
	mul	x3, x16, x9
	umulh	x4, x16, x9
	adds	x26, x26, x3
	adcs	x27, x27, x4
	adc	x28, xzr, xzr
	#  A[3] * B[2]
	mul	x3, x17, x8
	umulh	x4, x17, x8
	adds	x26, x26, x3
	adcs	x27, x27, x4
	adc	x28, x28, xzr
	#  A[3] * B[3]
	mul	x3, x17, x9
	umulh	x4, x17, x9
	adds	x27, x27, x3
	adc	x28, x28, x4
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x28, x28, x27, #63
	extr	x27, x27, x26, #63
	extr	x26, x26, x2, #63
	extr	x2, x2, x22, #63
	and	x22, x22, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x3, #19
	mul	x4, x3, x2
	umulh	x2, x3, x2
	adds	x19, x19, x4
	mul	x4, x3, x26
	umulh	x26, x3, x26
	adcs	x20, x20, x4
	mul	x4, x3, x27
	umulh	x27, x3, x27
	adcs	x21, x21, x4
	mul	x4, x3, x28
	umulh	x5, x3, x28
	adcs	x22, x22, x4
	adc	x5, x5, xzr
	#  Add remaining product results in
	adds	x20, x20, x2
	adcs	x21, x21, x26
	adcs	x22, x22, x27
	adc	x5, x5, xzr
	#  Overflow
	extr	x5, x5, x22, #63
	mul	x5, x5, x3
	and	x22, x22, #0x7fffffffffffffff
	adds	x19, x19, x5
	adcs	x20, x20, xzr
	adcs	x21, x21, xzr
	adc	x22, x22, xzr
	# Reduce if top bit set
	and	x5, x3, x22, asr 63
	and	x22, x22, #0x7fffffffffffffff
	adds	x19, x19, x5
	adcs	x20, x20, xzr
	adcs	x21, x21, xzr
	adc	x22, x22, xzr
	# Store
	stp	x19, x20, [x29, #112]
	stp	x21, x22, [x29, #128]
	# Multiply
	ldp	x2, x26, [x29, #144]
	ldp	x27, x28, [x29, #160]
	#  A[0] * B[0]
	mul	x19, x10, x2
	umulh	x20, x10, x2
	#  A[0] * B[1]
	mul	x3, x10, x26
	umulh	x21, x10, x26
	adds	x20, x20, x3
	adc	x21, x21, xzr
	#  A[1] * B[0]
	mul	x3, x11, x2
	umulh	x4, x11, x2
	adds	x20, x20, x3
	adcs	x21, x21, x4
	adc	x22, xzr, xzr
	#  A[0] * B[2]
	mul	x3, x10, x27
	umulh	x4, x10, x27
	adds	x21, x21, x3
	adc	x22, x22, x4
	#  A[1] * B[1]
	mul	x3, x11, x26
	umulh	x4, x11, x26
	adds	x21, x21, x3
	adcs	x22, x22, x4
	adc	x14, xzr, xzr
	#  A[2] * B[0]
	mul	x3, x12, x2
	umulh	x4, x12, x2
	adds	x21, x21, x3
	adcs	x22, x22, x4
	adc	x14, x14, xzr
	#  A[0] * B[3]
	mul	x3, x10, x28
	umulh	x4, x10, x28
	adds	x22, x22, x3
	adcs	x14, x14, x4
	adc	x15, xzr, xzr
	#  A[1] * B[2]
	mul	x3, x11, x27
	umulh	x4, x11, x27
	adds	x22, x22, x3
	adcs	x14, x14, x4
	adc	x15, x15, xzr
	#  A[2] * B[1]
	mul	x3, x12, x26
	umulh	x4, x12, x26
	adds	x22, x22, x3
	adcs	x14, x14, x4
	adc	x15, x15, xzr
	#  A[3] * B[0]
	mul	x3, x13, x2
	umulh	x4, x13, x2
	adds	x22, x22, x3
	adcs	x14, x14, x4
	adc	x15, x15, xzr
	#  A[1] * B[3]
	mul	x3, x11, x28
	umulh	x4, x11, x28
	adds	x14, x14, x3
	adcs	x15, x15, x4
	adc	x16, xzr, xzr
	#  A[2] * B[2]
	mul	x3, x12, x27
	umulh	x4, x12, x27
	adds	x14, x14, x3
	adcs	x15, x15, x4
	adc	x16, x16, xzr
	#  A[3] * B[1]
	mul	x3, x13, x26
	umulh	x4, x13, x26
	adds	x14, x14, x3
	adcs	x15, x15, x4
	adc	x16, x16, xzr
	#  A[2] * B[3]
	mul	x3, x12, x28
	umulh	x4, x12, x28
	adds	x15, x15, x3
	adcs	x16, x16, x4
	adc	x17, xzr, xzr
	#  A[3] * B[2]
	mul	x3, x13, x27
	umulh	x4, x13, x27
	adds	x15, x15, x3
	adcs	x16, x16, x4
	adc	x17, x17, xzr
	#  A[3] * B[3]
	mul	x3, x13, x28
	umulh	x4, x13, x28
	adds	x16, x16, x3
	adc	x17, x17, x4
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x17, x17, x16, #63
	extr	x16, x16, x15, #63
	extr	x15, x15, x14, #63
	extr	x14, x14, x22, #63
	and	x22, x22, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x3, #19
	mul	x4, x3, x14
	umulh	x14, x3, x14
	adds	x19, x19, x4
	mul	x4, x3, x15
	umulh	x15, x3, x15
	adcs	x20, x20, x4
	mul	x4, x3, x16
	umulh	x16, x3, x16
	adcs	x21, x21, x4
	mul	x4, x3, x17
	umulh	x5, x3, x17
	adcs	x22, x22, x4
	adc	x5, x5, xzr
	#  Add remaining product results in
	adds	x20, x20, x14
	adcs	x21, x21, x15
	adcs	x22, x22, x16
	adc	x5, x5, xzr
	#  Overflow
	extr	x5, x5, x22, #63
	mul	x5, x5, x3
	and	x22, x22, #0x7fffffffffffffff
	adds	x19, x19, x5
	adcs	x20, x20, xzr
	adcs	x21, x21, xzr
	adc	x22, x22, xzr
	# Reduce if top bit set
	and	x5, x3, x22, asr 63
	and	x22, x22, #0x7fffffffffffffff
	adds	x19, x19, x5
	adcs	x20, x20, xzr
	adcs	x21, x21, xzr
	adc	x22, x22, xzr
	# Store
	# Square
	#  A[0] * A[1]
	mul	x11, x2, x26
	umulh	x12, x2, x26
	#  A[0] * A[2]
	mul	x3, x2, x27
	umulh	x13, x2, x27
	adds	x12, x12, x3
	adc	x13, x13, xzr
	#  A[0] * A[3]
	mul	x3, x2, x28
	umulh	x14, x2, x28
	adds	x13, x13, x3
	adc	x14, x14, xzr
	#  A[1] * A[2]
	mul	x3, x26, x27
	umulh	x4, x26, x27
	adds	x13, x13, x3
	adcs	x14, x14, x4
	adc	x15, xzr, xzr
	#  A[1] * A[3]
	mul	x3, x26, x28
	umulh	x4, x26, x28
	adds	x14, x14, x3
	adc	x15, x15, x4
	#  A[2] * A[3]
	mul	x3, x27, x28
	umulh	x16, x27, x28
	adds	x15, x15, x3
	adc	x16, x16, xzr
	# Double
	adds	x11, x11, x11
	adcs	x12, x12, x12
	adcs	x13, x13, x13
	adcs	x14, x14, x14
	adcs	x15, x15, x15
	adcs	x16, x16, x16
	adc	x17, xzr, xzr
	#  A[0] * A[0]
	mul	x10, x2, x2
	umulh	x5, x2, x2
	#  A[1] * A[1]
	mul	x3, x26, x26
	umulh	x4, x26, x26
	adds	x11, x11, x5
	adcs	x12, x12, x3
	adc	x5, x4, xzr
	#  A[2] * A[2]
	mul	x3, x27, x27
	umulh	x4, x27, x27
	adds	x13, x13, x5
	adcs	x14, x14, x3
	adc	x5, x4, xzr
	#  A[3] * A[3]
	mul	x3, x28, x28
	umulh	x4, x28, x28
	adds	x15, x15, x5
	adcs	x16, x16, x3
	adc	x17, x17, x4
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x17, x17, x16, #63
	extr	x16, x16, x15, #63
	extr	x15, x15, x14, #63
	extr	x14, x14, x13, #63
	and	x13, x13, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x3, #19
	mul	x4, x3, x14
	umulh	x14, x3, x14
	adds	x10, x10, x4
	mul	x4, x3, x15
	umulh	x15, x3, x15
	adcs	x11, x11, x4
	mul	x4, x3, x16
	umulh	x16, x3, x16
	adcs	x12, x12, x4
	mul	x4, x3, x17
	umulh	x5, x3, x17
	adcs	x13, x13, x4
	adc	x5, x5, xzr
	#  Add remaining product results in
	adds	x11, x11, x14
	adcs	x12, x12, x15
	adcs	x13, x13, x16
	adc	x5, x5, xzr
	#  Overflow
	extr	x5, x5, x13, #63
	mul	x5, x5, x3
	and	x13, x13, #0x7fffffffffffffff
	adds	x10, x10, x5
	adcs	x11, x11, xzr
	adcs	x12, x12, xzr
	adc	x13, x13, xzr
	# Reduce if top bit set
	and	x5, x3, x13, asr 63
	and	x13, x13, #0x7fffffffffffffff
	adds	x10, x10, x5
	adcs	x11, x11, xzr
	adcs	x12, x12, xzr
	adc	x13, x13, xzr
	# Store
	# Square
	#  A[0] * A[1]
	mul	x15, x6, x7
	umulh	x16, x6, x7
	#  A[0] * A[2]
	mul	x3, x6, x8
	umulh	x17, x6, x8
	adds	x16, x16, x3
	adc	x17, x17, xzr
	#  A[0] * A[3]
	mul	x3, x6, x9
	umulh	x2, x6, x9
	adds	x17, x17, x3
	adc	x2, x2, xzr
	#  A[1] * A[2]
	mul	x3, x7, x8
	umulh	x4, x7, x8
	adds	x17, x17, x3
	adcs	x2, x2, x4
	adc	x26, xzr, xzr
	#  A[1] * A[3]
	mul	x3, x7, x9
	umulh	x4, x7, x9
	adds	x2, x2, x3
	adc	x26, x26, x4
	#  A[2] * A[3]
	mul	x3, x8, x9
	umulh	x27, x8, x9
	adds	x26, x26, x3
	adc	x27, x27, xzr
	# Double
	adds	x15, x15, x15
	adcs	x16, x16, x16
	adcs	x17, x17, x17
	adcs	x2, x2, x2
	adcs	x26, x26, x26
	adcs	x27, x27, x27
	adc	x28, xzr, xzr
	#  A[0] * A[0]
	mul	x14, x6, x6
	umulh	x5, x6, x6
	#  A[1] * A[1]
	mul	x3, x7, x7
	umulh	x4, x7, x7
	adds	x15, x15, x5
	adcs	x16, x16, x3
	adc	x5, x4, xzr
	#  A[2] * A[2]
	mul	x3, x8, x8
	umulh	x4, x8, x8
	adds	x17, x17, x5
	adcs	x2, x2, x3
	adc	x5, x4, xzr
	#  A[3] * A[3]
	mul	x3, x9, x9
	umulh	x4, x9, x9
	adds	x26, x26, x5
	adcs	x27, x27, x3
	adc	x28, x28, x4
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x28, x28, x27, #63
	extr	x27, x27, x26, #63
	extr	x26, x26, x2, #63
	extr	x2, x2, x17, #63
	and	x17, x17, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x3, #19
	mul	x4, x3, x2
	umulh	x2, x3, x2
	adds	x14, x14, x4
	mul	x4, x3, x26
	umulh	x26, x3, x26
	adcs	x15, x15, x4
	mul	x4, x3, x27
	umulh	x27, x3, x27
	adcs	x16, x16, x4
	mul	x4, x3, x28
	umulh	x5, x3, x28
	adcs	x17, x17, x4
	adc	x5, x5, xzr
	#  Add remaining product results in
	adds	x15, x15, x2
	adcs	x16, x16, x26
	adcs	x17, x17, x27
	adc	x5, x5, xzr
	#  Overflow
	extr	x5, x5, x17, #63
	mul	x5, x5, x3
	and	x17, x17, #0x7fffffffffffffff
	adds	x14, x14, x5
	adcs	x15, x15, xzr
	adcs	x16, x16, xzr
	adc	x17, x17, xzr
	# Reduce if top bit set
	and	x5, x3, x17, asr 63
	and	x17, x17, #0x7fffffffffffffff
	adds	x14, x14, x5
	adcs	x15, x15, xzr
	adcs	x16, x16, xzr
	adc	x17, x17, xzr
	# Store
	# Multiply
	#  A[0] * B[0]
	mul	x6, x14, x10
	umulh	x7, x14, x10
	#  A[0] * B[1]
	mul	x3, x14, x11
	umulh	x8, x14, x11
	adds	x7, x7, x3
	adc	x8, x8, xzr
	#  A[1] * B[0]
	mul	x3, x15, x10
	umulh	x4, x15, x10
	adds	x7, x7, x3
	adcs	x8, x8, x4
	adc	x9, xzr, xzr
	#  A[0] * B[2]
	mul	x3, x14, x12
	umulh	x4, x14, x12
	adds	x8, x8, x3
	adc	x9, x9, x4
	#  A[1] * B[1]
	mul	x3, x15, x11
	umulh	x4, x15, x11
	adds	x8, x8, x3
	adcs	x9, x9, x4
	adc	x2, xzr, xzr
	#  A[2] * B[0]
	mul	x3, x16, x10
	umulh	x4, x16, x10
	adds	x8, x8, x3
	adcs	x9, x9, x4
	adc	x2, x2, xzr
	#  A[0] * B[3]
	mul	x3, x14, x13
	umulh	x4, x14, x13
	adds	x9, x9, x3
	adcs	x2, x2, x4
	adc	x26, xzr, xzr
	#  A[1] * B[2]
	mul	x3, x15, x12
	umulh	x4, x15, x12
	adds	x9, x9, x3
	adcs	x2, x2, x4
	adc	x26, x26, xzr
	#  A[2] * B[1]
	mul	x3, x16, x11
	umulh	x4, x16, x11
	adds	x9, x9, x3
	adcs	x2, x2, x4
	adc	x26, x26, xzr
	#  A[3] * B[0]
	mul	x3, x17, x10
	umulh	x4, x17, x10
	adds	x9, x9, x3
	adcs	x2, x2, x4
	adc	x26, x26, xzr
	#  A[1] * B[3]
	mul	x3, x15, x13
	umulh	x4, x15, x13
	adds	x2, x2, x3
	adcs	x26, x26, x4
	adc	x27, xzr, xzr
	#  A[2] * B[2]
	mul	x3, x16, x12
	umulh	x4, x16, x12
	adds	x2, x2, x3
	adcs	x26, x26, x4
	adc	x27, x27, xzr
	#  A[3] * B[1]
	mul	x3, x17, x11
	umulh	x4, x17, x11
	adds	x2, x2, x3
	adcs	x26, x26, x4
	adc	x27, x27, xzr
	#  A[2] * B[3]
	mul	x3, x16, x13
	umulh	x4, x16, x13
	adds	x26, x26, x3
	adcs	x27, x27, x4
	adc	x28, xzr, xzr
	#  A[3] * B[2]
	mul	x3, x17, x12
	umulh	x4, x17, x12
	adds	x26, x26, x3
	adcs	x27, x27, x4
	adc	x28, x28, xzr
	#  A[3] * B[3]
	mul	x3, x17, x13
	umulh	x4, x17, x13
	adds	x27, x27, x3
	adc	x28, x28, x4
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x28, x28, x27, #63
	extr	x27, x27, x26, #63
	extr	x26, x26, x2, #63
	extr	x2, x2, x9, #63
	and	x9, x9, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x3, #19
	mul	x4, x3, x2
	umulh	x2, x3, x2
	adds	x6, x6, x4
	mul	x4, x3, x26
	umulh	x26, x3, x26
	adcs	x7, x7, x4
	mul	x4, x3, x27
	umulh	x27, x3, x27
	adcs	x8, x8, x4
	mul	x4, x3, x28
	umulh	x5, x3, x28
	adcs	x9, x9, x4
	adc	x5, x5, xzr
	#  Add remaining product results in
	adds	x7, x7, x2
	adcs	x8, x8, x26
	adcs	x9, x9, x27
	adc	x5, x5, xzr
	#  Overflow
	extr	x5, x5, x9, #63
	mul	x5, x5, x3
	and	x9, x9, #0x7fffffffffffffff
	adds	x6, x6, x5
	adcs	x7, x7, xzr
	adcs	x8, x8, xzr
	adc	x9, x9, xzr
	# Reduce if top bit set
	and	x5, x3, x9, asr 63
	and	x9, x9, #0x7fffffffffffffff
	adds	x6, x6, x5
	adcs	x7, x7, xzr
	adcs	x8, x8, xzr
	adc	x9, x9, xzr
	# Store
	stp	x6, x7, [x0]
	stp	x8, x9, [x0, #16]
	# Sub
	subs	x14, x14, x10
	sbcs	x15, x15, x11
	sbcs	x16, x16, x12
	sbcs	x17, x17, x13
	mov	x3, #-19
	csetm	x2, cc
	#   Mask the modulus
	and	x3, x2, x3
	and	x4, x2, #0x7fffffffffffffff
	#   Add modulus (if underflow)
	adds	x14, x14, x3
	adcs	x15, x15, x2
	adcs	x16, x16, x2
	adc	x17, x17, x4
	# Multiply by 121666
	mov	x5, #0xdb42
	movk	x5, #1, lsl 16
	mul	x6, x14, x5
	umulh	x7, x14, x5
	mul	x3, x15, x5
	umulh	x4, x15, x5
	adds	x7, x7, x3
	adc	x8, xzr, x4
	mul	x3, x16, x5
	umulh	x4, x16, x5
	adds	x8, x8, x3
	adc	x9, xzr, x4
	mul	x3, x17, x5
	umulh	x4, x17, x5
	adds	x9, x9, x3
	adc	x4, xzr, x4
	mov	x5, #19
	extr	x4, x4, x9, #63
	mul	x4, x4, x5
	and	x9, x9, #0x7fffffffffffffff
	adds	x6, x6, x4
	adcs	x7, x7, xzr
	adcs	x8, x8, xzr
	adc	x9, x9, xzr
	# Add
	adds	x10, x10, x6
	adcs	x11, x11, x7
	adcs	x12, x12, x8
	adc	x13, x13, x9
	mov	x3, #-19
	asr	x2, x13, #63
	#   Mask the modulus
	and	x3, x2, x3
	and	x4, x2, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x10, x10, x3
	sbcs	x11, x11, x2
	sbcs	x12, x12, x2
	sbc	x13, x13, x4
	# Multiply
	#  A[0] * B[0]
	mul	x6, x14, x10
	umulh	x7, x14, x10
	#  A[0] * B[1]
	mul	x3, x14, x11
	umulh	x8, x14, x11
	adds	x7, x7, x3
	adc	x8, x8, xzr
	#  A[1] * B[0]
	mul	x3, x15, x10
	umulh	x4, x15, x10
	adds	x7, x7, x3
	adcs	x8, x8, x4
	adc	x9, xzr, xzr
	#  A[0] * B[2]
	mul	x3, x14, x12
	umulh	x4, x14, x12
	adds	x8, x8, x3
	adc	x9, x9, x4
	#  A[1] * B[1]
	mul	x3, x15, x11
	umulh	x4, x15, x11
	adds	x8, x8, x3
	adcs	x9, x9, x4
	adc	x2, xzr, xzr
	#  A[2] * B[0]
	mul	x3, x16, x10
	umulh	x4, x16, x10
	adds	x8, x8, x3
	adcs	x9, x9, x4
	adc	x2, x2, xzr
	#  A[0] * B[3]
	mul	x3, x14, x13
	umulh	x4, x14, x13
	adds	x9, x9, x3
	adcs	x2, x2, x4
	adc	x26, xzr, xzr
	#  A[1] * B[2]
	mul	x3, x15, x12
	umulh	x4, x15, x12
	adds	x9, x9, x3
	adcs	x2, x2, x4
	adc	x26, x26, xzr
	#  A[2] * B[1]
	mul	x3, x16, x11
	umulh	x4, x16, x11
	adds	x9, x9, x3
	adcs	x2, x2, x4
	adc	x26, x26, xzr
	#  A[3] * B[0]
	mul	x3, x17, x10
	umulh	x4, x17, x10
	adds	x9, x9, x3
	adcs	x2, x2, x4
	adc	x26, x26, xzr
	#  A[1] * B[3]
	mul	x3, x15, x13
	umulh	x4, x15, x13
	adds	x2, x2, x3
	adcs	x26, x26, x4
	adc	x27, xzr, xzr
	#  A[2] * B[2]
	mul	x3, x16, x12
	umulh	x4, x16, x12
	adds	x2, x2, x3
	adcs	x26, x26, x4
	adc	x27, x27, xzr
	#  A[3] * B[1]
	mul	x3, x17, x11
	umulh	x4, x17, x11
	adds	x2, x2, x3
	adcs	x26, x26, x4
	adc	x27, x27, xzr
	#  A[2] * B[3]
	mul	x3, x16, x13
	umulh	x4, x16, x13
	adds	x26, x26, x3
	adcs	x27, x27, x4
	adc	x28, xzr, xzr
	#  A[3] * B[2]
	mul	x3, x17, x12
	umulh	x4, x17, x12
	adds	x26, x26, x3
	adcs	x27, x27, x4
	adc	x28, x28, xzr
	#  A[3] * B[3]
	mul	x3, x17, x13
	umulh	x4, x17, x13
	adds	x27, x27, x3
	adc	x28, x28, x4
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x28, x28, x27, #63
	extr	x27, x27, x26, #63
	extr	x26, x26, x2, #63
	extr	x2, x2, x9, #63
	and	x9, x9, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x3, #19
	mul	x4, x3, x2
	umulh	x2, x3, x2
	adds	x6, x6, x4
	mul	x4, x3, x26
	umulh	x26, x3, x26
	adcs	x7, x7, x4
	mul	x4, x3, x27
	umulh	x27, x3, x27
	adcs	x8, x8, x4
	mul	x4, x3, x28
	umulh	x5, x3, x28
	adcs	x9, x9, x4
	adc	x5, x5, xzr
	#  Add remaining product results in
	adds	x7, x7, x2
	adcs	x8, x8, x26
	adcs	x9, x9, x27
	adc	x5, x5, xzr
	#  Overflow
	extr	x5, x5, x9, #63
	mul	x5, x5, x3
	and	x9, x9, #0x7fffffffffffffff
	adds	x6, x6, x5
	adcs	x7, x7, xzr
	adcs	x8, x8, xzr
	adc	x9, x9, xzr
	# Reduce if top bit set
	and	x5, x3, x9, asr 63
	and	x9, x9, #0x7fffffffffffffff
	adds	x6, x6, x5
	adcs	x7, x7, xzr
	adcs	x8, x8, xzr
	adc	x9, x9, xzr
	# Store
	stp	x6, x7, [x29, #16]
	stp	x8, x9, [x29, #32]
	# Add
	ldp	x6, x7, [x29, #112]
	ldp	x8, x9, [x29, #128]
	adds	x10, x6, x19
	adcs	x11, x7, x20
	adcs	x12, x8, x21
	adc	x13, x9, x22
	mov	x3, #-19
	asr	x2, x13, #63
	#   Mask the modulus
	and	x3, x2, x3
	and	x4, x2, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x10, x10, x3
	sbcs	x11, x11, x2
	sbcs	x12, x12, x2
	sbc	x13, x13, x4
	# Sub
	subs	x19, x6, x19
	sbcs	x20, x7, x20
	sbcs	x21, x8, x21
	sbcs	x22, x9, x22
	mov	x3, #-19
	csetm	x2, cc
	#   Mask the modulus
	and	x3, x2, x3
	and	x4, x2, #0x7fffffffffffffff
	#   Add modulus (if underflow)
	adds	x19, x19, x3
	adcs	x20, x20, x2
	adcs	x21, x21, x2
	adc	x22, x22, x4
	# Square
	#  A[0] * A[1]
	mul	x7, x10, x11
	umulh	x8, x10, x11
	#  A[0] * A[2]
	mul	x3, x10, x12
	umulh	x9, x10, x12
	adds	x8, x8, x3
	adc	x9, x9, xzr
	#  A[0] * A[3]
	mul	x3, x10, x13
	umulh	x2, x10, x13
	adds	x9, x9, x3
	adc	x2, x2, xzr
	#  A[1] * A[2]
	mul	x3, x11, x12
	umulh	x4, x11, x12
	adds	x9, x9, x3
	adcs	x2, x2, x4
	adc	x26, xzr, xzr
	#  A[1] * A[3]
	mul	x3, x11, x13
	umulh	x4, x11, x13
	adds	x2, x2, x3
	adc	x26, x26, x4
	#  A[2] * A[3]
	mul	x3, x12, x13
	umulh	x27, x12, x13
	adds	x26, x26, x3
	adc	x27, x27, xzr
	# Double
	adds	x7, x7, x7
	adcs	x8, x8, x8
	adcs	x9, x9, x9
	adcs	x2, x2, x2
	adcs	x26, x26, x26
	adcs	x27, x27, x27
	adc	x28, xzr, xzr
	#  A[0] * A[0]
	mul	x6, x10, x10
	umulh	x5, x10, x10
	#  A[1] * A[1]
	mul	x3, x11, x11
	umulh	x4, x11, x11
	adds	x7, x7, x5
	adcs	x8, x8, x3
	adc	x5, x4, xzr
	#  A[2] * A[2]
	mul	x3, x12, x12
	umulh	x4, x12, x12
	adds	x9, x9, x5
	adcs	x2, x2, x3
	adc	x5, x4, xzr
	#  A[3] * A[3]
	mul	x3, x13, x13
	umulh	x4, x13, x13
	adds	x26, x26, x5
	adcs	x27, x27, x3
	adc	x28, x28, x4
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x28, x28, x27, #63
	extr	x27, x27, x26, #63
	extr	x26, x26, x2, #63
	extr	x2, x2, x9, #63
	and	x9, x9, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x3, #19
	mul	x4, x3, x2
	umulh	x2, x3, x2
	adds	x6, x6, x4
	mul	x4, x3, x26
	umulh	x26, x3, x26
	adcs	x7, x7, x4
	mul	x4, x3, x27
	umulh	x27, x3, x27
	adcs	x8, x8, x4
	mul	x4, x3, x28
	umulh	x5, x3, x28
	adcs	x9, x9, x4
	adc	x5, x5, xzr
	#  Add remaining product results in
	adds	x7, x7, x2
	adcs	x8, x8, x26
	adcs	x9, x9, x27
	adc	x5, x5, xzr
	#  Overflow
	extr	x5, x5, x9, #63
	mul	x5, x5, x3
	and	x9, x9, #0x7fffffffffffffff
	adds	x6, x6, x5
	adcs	x7, x7, xzr
	adcs	x8, x8, xzr
	adc	x9, x9, xzr
	# Reduce if top bit set
	and	x5, x3, x9, asr 63
	and	x9, x9, #0x7fffffffffffffff
	adds	x6, x6, x5
	adcs	x7, x7, xzr
	adcs	x8, x8, xzr
	adc	x9, x9, xzr
	# Store
	stp	x6, x7, [x29, #80]
	stp	x8, x9, [x29, #96]
	# Square
	#  A[0] * A[1]
	mul	x7, x19, x20
	umulh	x8, x19, x20
	#  A[0] * A[2]
	mul	x3, x19, x21
	umulh	x9, x19, x21
	adds	x8, x8, x3
	adc	x9, x9, xzr
	#  A[0] * A[3]
	mul	x3, x19, x22
	umulh	x2, x19, x22
	adds	x9, x9, x3
	adc	x2, x2, xzr
	#  A[1] * A[2]
	mul	x3, x20, x21
	umulh	x4, x20, x21
	adds	x9, x9, x3
	adcs	x2, x2, x4
	adc	x26, xzr, xzr
	#  A[1] * A[3]
	mul	x3, x20, x22
	umulh	x4, x20, x22
	adds	x2, x2, x3
	adc	x26, x26, x4
	#  A[2] * A[3]
	mul	x3, x21, x22
	umulh	x27, x21, x22
	adds	x26, x26, x3
	adc	x27, x27, xzr
	# Double
	adds	x7, x7, x7
	adcs	x8, x8, x8
	adcs	x9, x9, x9
	adcs	x2, x2, x2
	adcs	x26, x26, x26
	adcs	x27, x27, x27
	adc	x28, xzr, xzr
	#  A[0] * A[0]
	mul	x6, x19, x19
	umulh	x5, x19, x19
	#  A[1] * A[1]
	mul	x3, x20, x20
	umulh	x4, x20, x20
	adds	x7, x7, x5
	adcs	x8, x8, x3
	adc	x5, x4, xzr
	#  A[2] * A[2]
	mul	x3, x21, x21
	umulh	x4, x21, x21
	adds	x9, x9, x5
	adcs	x2, x2, x3
	adc	x5, x4, xzr
	#  A[3] * A[3]
	mul	x3, x22, x22
	umulh	x4, x22, x22
	adds	x26, x26, x5
	adcs	x27, x27, x3
	adc	x28, x28, x4
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x28, x28, x27, #63
	extr	x27, x27, x26, #63
	extr	x26, x26, x2, #63
	extr	x2, x2, x9, #63
	and	x9, x9, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x3, #19
	mul	x4, x3, x2
	umulh	x2, x3, x2
	adds	x6, x6, x4
	mul	x4, x3, x26
	umulh	x26, x3, x26
	adcs	x7, x7, x4
	mul	x4, x3, x27
	umulh	x27, x3, x27
	adcs	x8, x8, x4
	mul	x4, x3, x28
	umulh	x5, x3, x28
	adcs	x9, x9, x4
	adc	x5, x5, xzr
	#  Add remaining product results in
	adds	x7, x7, x2
	adcs	x8, x8, x26
	adcs	x9, x9, x27
	adc	x5, x5, xzr
	#  Overflow
	extr	x5, x5, x9, #63
	mul	x5, x5, x3
	and	x9, x9, #0x7fffffffffffffff
	adds	x6, x6, x5
	adcs	x7, x7, xzr
	adcs	x8, x8, xzr
	adc	x9, x9, xzr
	# Reduce if top bit set
	and	x5, x3, x9, asr 63
	and	x9, x9, #0x7fffffffffffffff
	adds	x6, x6, x5
	adcs	x7, x7, xzr
	adcs	x8, x8, xzr
	adc	x9, x9, xzr
	# Store
	ldr	x2, [x29, #184]
	# Multiply
	ldp	x14, x15, [x2]
	ldp	x16, x17, [x2, #16]
	#  A[0] * B[0]
	mul	x10, x14, x6
	umulh	x11, x14, x6
	#  A[0] * B[1]
	mul	x3, x14, x7
	umulh	x12, x14, x7
	adds	x11, x11, x3
	adc	x12, x12, xzr
	#  A[1] * B[0]
	mul	x3, x15, x6
	umulh	x4, x15, x6
	adds	x11, x11, x3
	adcs	x12, x12, x4
	adc	x13, xzr, xzr
	#  A[0] * B[2]
	mul	x3, x14, x8
	umulh	x4, x14, x8
	adds	x12, x12, x3
	adc	x13, x13, x4
	#  A[1] * B[1]
	mul	x3, x15, x7
	umulh	x4, x15, x7
	adds	x12, x12, x3
	adcs	x13, x13, x4
	adc	x2, xzr, xzr
	#  A[2] * B[0]
	mul	x3, x16, x6
	umulh	x4, x16, x6
	adds	x12, x12, x3
	adcs	x13, x13, x4
	adc	x2, x2, xzr
	#  A[0] * B[3]
	mul	x3, x14, x9
	umulh	x4, x14, x9
	adds	x13, x13, x3
	adcs	x2, x2, x4
	adc	x26, xzr, xzr
	#  A[1] * B[2]
	mul	x3, x15, x8
	umulh	x4, x15, x8
	adds	x13, x13, x3
	adcs	x2, x2, x4
	adc	x26, x26, xzr
	#  A[2] * B[1]
	mul	x3, x16, x7
	umulh	x4, x16, x7
	adds	x13, x13, x3
	adcs	x2, x2, x4
	adc	x26, x26, xzr
	#  A[3] * B[0]
	mul	x3, x17, x6
	umulh	x4, x17, x6
	adds	x13, x13, x3
	adcs	x2, x2, x4
	adc	x26, x26, xzr
	#  A[1] * B[3]
	mul	x3, x15, x9
	umulh	x4, x15, x9
	adds	x2, x2, x3
	adcs	x26, x26, x4
	adc	x27, xzr, xzr
	#  A[2] * B[2]
	mul	x3, x16, x8
	umulh	x4, x16, x8
	adds	x2, x2, x3
	adcs	x26, x26, x4
	adc	x27, x27, xzr
	#  A[3] * B[1]
	mul	x3, x17, x7
	umulh	x4, x17, x7
	adds	x2, x2, x3
	adcs	x26, x26, x4
	adc	x27, x27, xzr
	#  A[2] * B[3]
	mul	x3, x16, x9
	umulh	x4, x16, x9
	adds	x26, x26, x3
	adcs	x27, x27, x4
	adc	x28, xzr, xzr
	#  A[3] * B[2]
	mul	x3, x17, x8
	umulh	x4, x17, x8
	adds	x26, x26, x3
	adcs	x27, x27, x4
	adc	x28, x28, xzr
	#  A[3] * B[3]
	mul	x3, x17, x9
	umulh	x4, x17, x9
	adds	x27, x27, x3
	adc	x28, x28, x4
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x28, x28, x27, #63
	extr	x27, x27, x26, #63
	extr	x26, x26, x2, #63
	extr	x2, x2, x13, #63
	and	x13, x13, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x3, #19
	mul	x4, x3, x2
	umulh	x2, x3, x2
	adds	x10, x10, x4
	mul	x4, x3, x26
	umulh	x26, x3, x26
	adcs	x11, x11, x4
	mul	x4, x3, x27
	umulh	x27, x3, x27
	adcs	x12, x12, x4
	mul	x4, x3, x28
	umulh	x5, x3, x28
	adcs	x13, x13, x4
	adc	x5, x5, xzr
	#  Add remaining product results in
	adds	x11, x11, x2
	adcs	x12, x12, x26
	adcs	x13, x13, x27
	adc	x5, x5, xzr
	#  Overflow
	extr	x5, x5, x13, #63
	mul	x5, x5, x3
	and	x13, x13, #0x7fffffffffffffff
	adds	x10, x10, x5
	adcs	x11, x11, xzr
	adcs	x12, x12, xzr
	adc	x13, x13, xzr
	# Reduce if top bit set
	and	x5, x3, x13, asr 63
	and	x13, x13, #0x7fffffffffffffff
	adds	x10, x10, x5
	adcs	x11, x11, xzr
	adcs	x12, x12, xzr
	adc	x13, x13, xzr
	# Store
	stp	x10, x11, [x29, #48]
	stp	x12, x13, [x29, #64]
	sub	x25, x25, #1
	cmp	x25, #0
	bge	L_curve25519_bits
	mov	x25, #63
	sub	x24, x24, #8
	cmp	x24, #0
	bge	L_curve25519_words
	# Invert
	add	x0, x29, #48
	add	x1, x29, #16
	bl	fe_sq
	add	x0, x29, #0x50
	add	x1, x29, #48
	bl	fe_sq
	add	x1, x29, #0x50
	bl	fe_sq
	add	x1, x29, #16
	add	x2, x29, #0x50
	bl	fe_mul
	add	x0, x29, #48
	add	x1, x29, #48
	add	x2, x29, #0x50
	bl	fe_mul
	add	x0, x29, #0x70
	bl	fe_sq
	add	x0, x29, #0x50
	add	x1, x29, #0x50
	add	x2, x29, #0x70
	bl	fe_mul
	add	x0, x29, #0x70
	bl	fe_sq
	mov	x24, #4
	add	x1, x29, #0x70
L_curve25519_inv_1:
	bl	fe_sq
	sub	x24, x24, #1
	cmp	x24, #0
	bne	L_curve25519_inv_1
	add	x0, x29, #0x50
	add	x2, x29, #0x50
	bl	fe_mul
	add	x0, x29, #0x70
	add	x1, x29, #0x50
	bl	fe_sq
	mov	x24, #9
	add	x1, x29, #0x70
L_curve25519_inv_2:
	bl	fe_sq
	sub	x24, x24, #1
	cmp	x24, #0
	bne	L_curve25519_inv_2
	add	x2, x29, #0x50
	bl	fe_mul
	add	x0, x29, #0x90
	bl	fe_sq
	mov	x24, #19
	add	x1, x29, #0x90
L_curve25519_inv_3:
	bl	fe_sq
	sub	x24, x24, #1
	cmp	x24, #0
	bne	L_curve25519_inv_3
	add	x0, x29, #0x70
	add	x2, x29, #0x70
	bl	fe_mul
	mov	x24, #10
	add	x1, x29, #0x70
L_curve25519_inv_4:
	bl	fe_sq
	sub	x24, x24, #1
	cmp	x24, #0
	bne	L_curve25519_inv_4
	add	x0, x29, #0x50
	add	x2, x29, #0x50
	bl	fe_mul
	add	x0, x29, #0x70
	add	x1, x29, #0x50
	bl	fe_sq
	mov	x24, #49
	add	x1, x29, #0x70
L_curve25519_inv_5:
	bl	fe_sq
	sub	x24, x24, #1
	cmp	x24, #0
	bne	L_curve25519_inv_5
	add	x2, x29, #0x50
	bl	fe_mul
	add	x0, x29, #0x90
	bl	fe_sq
	mov	x24, #0x63
	add	x1, x29, #0x90
L_curve25519_inv_6:
	bl	fe_sq
	sub	x24, x24, #1
	cmp	x24, #0
	bne	L_curve25519_inv_6
	add	x0, x29, #0x70
	add	x2, x29, #0x70
	bl	fe_mul
	mov	x24, #50
	add	x1, x29, #0x70
L_curve25519_inv_7:
	bl	fe_sq
	sub	x24, x24, #1
	cmp	x24, #0
	bne	L_curve25519_inv_7
	add	x0, x29, #0x50
	add	x2, x29, #0x50
	bl	fe_mul
	mov	x24, #5
	add	x1, x29, #0x50
L_curve25519_inv_8:
	bl	fe_sq
	sub	x24, x24, #1
	cmp	x24, #0
	bne	L_curve25519_inv_8
	add	x0, x29, #16
	add	x2, x29, #48
	bl	fe_mul
	ldr	x0, [x29, #176]
	# Multiply
	ldp	x6, x7, [x0]
	ldp	x8, x9, [x0, #16]
	ldp	x10, x11, [x29, #16]
	ldp	x12, x13, [x29, #32]
	#  A[0] * B[0]
	mul	x14, x6, x10
	umulh	x15, x6, x10
	#  A[0] * B[1]
	mul	x3, x6, x11
	umulh	x16, x6, x11
	adds	x15, x15, x3
	adc	x16, x16, xzr
	#  A[1] * B[0]
	mul	x3, x7, x10
	umulh	x4, x7, x10
	adds	x15, x15, x3
	adcs	x16, x16, x4
	adc	x17, xzr, xzr
	#  A[0] * B[2]
	mul	x3, x6, x12
	umulh	x4, x6, x12
	adds	x16, x16, x3
	adc	x17, x17, x4
	#  A[1] * B[1]
	mul	x3, x7, x11
	umulh	x4, x7, x11
	adds	x16, x16, x3
	adcs	x17, x17, x4
	adc	x19, xzr, xzr
	#  A[2] * B[0]
	mul	x3, x8, x10
	umulh	x4, x8, x10
	adds	x16, x16, x3
	adcs	x17, x17, x4
	adc	x19, x19, xzr
	#  A[0] * B[3]
	mul	x3, x6, x13
	umulh	x4, x6, x13
	adds	x17, x17, x3
	adcs	x19, x19, x4
	adc	x20, xzr, xzr
	#  A[1] * B[2]
	mul	x3, x7, x12
	umulh	x4, x7, x12
	adds	x17, x17, x3
	adcs	x19, x19, x4
	adc	x20, x20, xzr
	#  A[2] * B[1]
	mul	x3, x8, x11
	umulh	x4, x8, x11
	adds	x17, x17, x3
	adcs	x19, x19, x4
	adc	x20, x20, xzr
	#  A[3] * B[0]
	mul	x3, x9, x10
	umulh	x4, x9, x10
	adds	x17, x17, x3
	adcs	x19, x19, x4
	adc	x20, x20, xzr
	#  A[1] * B[3]
	mul	x3, x7, x13
	umulh	x4, x7, x13
	adds	x19, x19, x3
	adcs	x20, x20, x4
	adc	x21, xzr, xzr
	#  A[2] * B[2]
	mul	x3, x8, x12
	umulh	x4, x8, x12
	adds	x19, x19, x3
	adcs	x20, x20, x4
	adc	x21, x21, xzr
	#  A[3] * B[1]
	mul	x3, x9, x11
	umulh	x4, x9, x11
	adds	x19, x19, x3
	adcs	x20, x20, x4
	adc	x21, x21, xzr
	#  A[2] * B[3]
	mul	x3, x8, x13
	umulh	x4, x8, x13
	adds	x20, x20, x3
	adcs	x21, x21, x4
	adc	x22, xzr, xzr
	#  A[3] * B[2]
	mul	x3, x9, x12
	umulh	x4, x9, x12
	adds	x20, x20, x3
	adcs	x21, x21, x4
	adc	x22, x22, xzr
	#  A[3] * B[3]
	mul	x3, x9, x13
	umulh	x4, x9, x13
	adds	x21, x21, x3
	adc	x22, x22, x4
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x22, x22, x21, #63
	extr	x21, x21, x20, #63
	extr	x20, x20, x19, #63
	extr	x19, x19, x17, #63
	and	x17, x17, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x3, #19
	mul	x4, x3, x19
	umulh	x19, x3, x19
	adds	x14, x14, x4
	mul	x4, x3, x20
	umulh	x20, x3, x20
	adcs	x15, x15, x4
	mul	x4, x3, x21
	umulh	x21, x3, x21
	adcs	x16, x16, x4
	mul	x4, x3, x22
	umulh	x5, x3, x22
	adcs	x17, x17, x4
	adc	x5, x5, xzr
	#  Add remaining product results in
	adds	x15, x15, x19
	adcs	x16, x16, x20
	adcs	x17, x17, x21
	adc	x5, x5, xzr
	#  Overflow
	extr	x5, x5, x17, #63
	mul	x5, x5, x3
	and	x17, x17, #0x7fffffffffffffff
	adds	x14, x14, x5
	adcs	x15, x15, xzr
	adcs	x16, x16, xzr
	adc	x17, x17, xzr
	# Reduce if top bit set
	and	x5, x3, x17, asr 63
	and	x17, x17, #0x7fffffffffffffff
	adds	x14, x14, x5
	adcs	x15, x15, xzr
	adcs	x16, x16, xzr
	adc	x17, x17, xzr
	# Store
	stp	x14, x15, [x0]
	stp	x16, x17, [x0, #16]
	mov	x0, xzr
	ldr	x17, [x29, #200]
	ldr	x19, [x29, #208]
	ldp	x20, x21, [x29, #216]
	ldp	x22, x23, [x29, #232]
	ldp	x24, x25, [x29, #248]
	ldp	x26, x27, [x29, #264]
	ldr	x28, [x29, #280]
	ldp	x29, x30, [sp], #0x120
	ret
	.size	curve25519,.-curve25519
	.text
	.align	2
	.globl	fe_pow22523
	.type	fe_pow22523, %function
/* fe_pow22523
 *
 * In:  x0 = pointer to the 32-byte result, x1 = pointer to the 32-byte input.
 *
 * Runs a fixed chain of fe_sq / fe_mul calls on the input z. Assuming
 * fe_sq(r, a) stores a^2 and fe_mul(r, a, b) stores a * b (both helpers are
 * defined elsewhere in this file -- TODO confirm), the chain leaves
 * z^(2^252 - 3) in the result buffer. The exponents quoted in the comments
 * below all rest on that assumption.
 *
 * NOTE(review): x0 and x1 are often left untouched between consecutive
 * calls, while x2 is always set immediately before fe_mul. The routine
 * therefore presumably relies on fe_sq and fe_mul preserving x0 and x1;
 * verify against their definitions before changing either helper.
 *
 * Frame (144 bytes, addressed through x29):
 *   [x29, #0]    saved x29 / x30
 *   [x29, #16]   t0, 32-byte temporary
 *   [x29, #48]   t1, 32-byte temporary
 *   [x29, #80]   t2, 32-byte temporary
 *   [x29, #112]  result pointer passed in x0
 *   [x29, #120]  input pointer passed in x1
 *   [x29, #136]  saved x21 (callee-saved, used as the repeat counter)
 */
fe_pow22523:
	stp	x29, x30, [sp, #-144]!
	mov	x29, sp
	str	x21, [x29, #136]
	# Keep both caller pointers for the final multiply
	stp	x0, x1, [x29, #112]
	# Step: t0 = z^2 (x1 still holds the input pointer)
	add	x0, x29, #16
	bl	fe_sq
	# Step: t1 = t0^2 = z^4
	add	x1, x29, #16
	add	x0, x29, #48
	bl	fe_sq
	# Step: t1 = t1^2 = z^8
	add	x1, x29, #48
	bl	fe_sq
	# Step: t1 = z * t1 = z^9
	add	x2, x29, #48
	ldr	x1, [x29, #120]
	bl	fe_mul
	# Step: t0 = t0 * t1 = z^11
	add	x2, x29, #48
	add	x1, x29, #16
	add	x0, x29, #16
	bl	fe_mul
	# Step: t0 = t0^2 = z^22 (x0 and x1 both still address t0)
	bl	fe_sq
	# Step: t0 = t1 * t0 = z^31 = z^(2^5 - 1)
	add	x2, x29, #16
	add	x1, x29, #48
	bl	fe_mul
	# Step: t1 = t0 squared 5 times (one call here, four in the loop)
	add	x1, x29, #16
	add	x0, x29, #48
	bl	fe_sq
	add	x1, x29, #48
	mov	x21, #4
.L_fe_pow22523_1:
	bl	fe_sq
	subs	x21, x21, #1
	b.ne	.L_fe_pow22523_1
	# Step: t0 = t1 * t0 = z^(2^10 - 1)
	add	x2, x29, #16
	add	x0, x29, #16
	bl	fe_mul
	# Step: t1 = t0 squared 10 times (one call here, nine in the loop)
	add	x1, x29, #16
	add	x0, x29, #48
	bl	fe_sq
	add	x1, x29, #48
	mov	x21, #9
.L_fe_pow22523_2:
	bl	fe_sq
	subs	x21, x21, #1
	b.ne	.L_fe_pow22523_2
	# Step: t1 = t1 * t0 = z^(2^20 - 1)
	add	x2, x29, #16
	bl	fe_mul
	# Step: t2 = t1 squared 20 times (one call here, nineteen in the loop)
	add	x0, x29, #80
	bl	fe_sq
	add	x1, x29, #80
	mov	x21, #19
.L_fe_pow22523_3:
	bl	fe_sq
	subs	x21, x21, #1
	b.ne	.L_fe_pow22523_3
	# Step: t1 = t2 * t1 = z^(2^40 - 1)
	add	x2, x29, #48
	add	x0, x29, #48
	bl	fe_mul
	# Step: t1 = t1 squared 10 times, all in the loop
	add	x1, x29, #48
	mov	x21, #10
.L_fe_pow22523_4:
	bl	fe_sq
	subs	x21, x21, #1
	b.ne	.L_fe_pow22523_4
	# Step: t0 = t1 * t0 = z^(2^50 - 1)
	add	x2, x29, #16
	add	x0, x29, #16
	bl	fe_mul
	# Step: t1 = t0 squared 50 times (one call here, 49 in the loop)
	add	x1, x29, #16
	add	x0, x29, #48
	bl	fe_sq
	add	x1, x29, #48
	mov	x21, #49
.L_fe_pow22523_5:
	bl	fe_sq
	subs	x21, x21, #1
	b.ne	.L_fe_pow22523_5
	# Step: t1 = t1 * t0 = z^(2^100 - 1)
	add	x2, x29, #16
	bl	fe_mul
	# Step: t2 = t1 squared 100 times (one call here, 99 in the loop)
	add	x0, x29, #80
	bl	fe_sq
	add	x1, x29, #80
	mov	x21, #99
.L_fe_pow22523_6:
	bl	fe_sq
	subs	x21, x21, #1
	b.ne	.L_fe_pow22523_6
	# Step: t1 = t2 * t1 = z^(2^200 - 1)
	add	x2, x29, #48
	add	x0, x29, #48
	bl	fe_mul
	# Step: t1 = t1 squared 50 times, all in the loop
	add	x1, x29, #48
	mov	x21, #50
.L_fe_pow22523_7:
	bl	fe_sq
	subs	x21, x21, #1
	b.ne	.L_fe_pow22523_7
	# Step: t0 = t1 * t0 = z^(2^250 - 1)
	add	x2, x29, #16
	add	x0, x29, #16
	bl	fe_mul
	# Step: t0 = t0 squared twice = z^(2^252 - 4)
	add	x1, x29, #16
	mov	x21, #2
.L_fe_pow22523_8:
	bl	fe_sq
	subs	x21, x21, #1
	b.ne	.L_fe_pow22523_8
	# Step: result = t0 * z = z^(2^252 - 3); x1 still addresses t0
	ldp	x0, x2, [x29, #112]
	bl	fe_mul
	# Restore the counter register and unwind the frame
	ldr	x21, [x29, #136]
	ldp	x29, x30, [sp], #144
	ret
	.size	fe_pow22523,.-fe_pow22523
	.text
	.align	2
	.globl	fe_ge_to_p2
	.type	fe_ge_to_p2, %function
/* fe_ge_to_p2
 *
 * In:  x0, x1, x2 = pointers to three 32-byte outputs
 *      x3, x4, x5, x6 = pointers to four 32-byte inputs
 *
 * Writes three products of 4 x 64-bit little-endian limb values:
 *   [x0] = [x3] * [x6]
 *   [x1] = [x4] * [x5]
 *   [x2] = [x5] * [x6]
 * Each 512-bit product is folded back to four limbs by multiplying every bit
 * from position 255 upwards by 19 and adding it to the low 255 bits, i.e. the
 * result stays congruent to the product modulo 2^255 - 19.
 * NOTE(review): the stored value fits in four limbs with the top bit folded,
 * but nothing here shows it is the canonical representative below
 * 2^255 - 19 -- confirm with callers before relying on that.
 *
 * All input limbs of a product are in registers before its result is stored.
 * The third product reuses x15, x16, x17, x19 as loaded from [x5] for the
 * second product and does not reload them from memory.
 *
 * Frame (112 bytes, addressed through x29):
 *   [x29, #0]    saved x29 / x30
 *   [x29, #16]   incoming x1      [x29, #24]   incoming x2
 *   [x29, #32]   incoming x3      [x29, #40]   incoming x4
 *   [x29, #48]   incoming x5      [x29, #56]   incoming x6
 *   [x29, #72]   saved x17        [x29, #80]   saved x19
 *   [x29, #88]   saved x20, x21   [x29, #104]  saved x22
 */
fe_ge_to_p2:
	stp	x29, x30, [sp, #-112]!
	add	x29, sp, #0
	# Save the registers this routine overwrites and restores on exit
	str	x17, [x29, #72]
	str	x19, [x29, #80]
	stp	x20, x21, [x29, #88]
	str	x22, [x29, #104]
	# Spill the pointer arguments; x0 stays live as the first destination
	str	x1, [x29, #16]
	str	x2, [x29, #24]
	str	x3, [x29, #32]
	str	x4, [x29, #40]
	str	x5, [x29, #48]
	str	x6, [x29, #56]
	# First product: [x0] = [incoming x3] * [incoming x6]
	ldr	x1, [x29, #32]
	ldr	x2, [x29, #56]
	# Multiply
	# Limbs: A = x11..x14 from [x1], B = x15, x16, x17, x19 from [x2]
	# Product accumulates in x3..x10; x20 / x21 hold each partial product
	ldp	x11, x12, [x1]
	ldp	x13, x14, [x1, #16]
	ldp	x15, x16, [x2]
	ldp	x17, x19, [x2, #16]
	#  A[0] * B[0]
	mul	x3, x11, x15
	umulh	x4, x11, x15
	#  A[0] * B[1]
	mul	x20, x11, x16
	umulh	x5, x11, x16
	adds	x4, x4, x20
	adc	x5, x5, xzr
	#  A[1] * B[0]
	mul	x20, x12, x15
	umulh	x21, x12, x15
	adds	x4, x4, x20
	adcs	x5, x5, x21
	adc	x6, xzr, xzr
	#  A[0] * B[2]
	mul	x20, x11, x17
	umulh	x21, x11, x17
	adds	x5, x5, x20
	adc	x6, x6, x21
	#  A[1] * B[1]
	mul	x20, x12, x16
	umulh	x21, x12, x16
	adds	x5, x5, x20
	adcs	x6, x6, x21
	adc	x7, xzr, xzr
	#  A[2] * B[0]
	mul	x20, x13, x15
	umulh	x21, x13, x15
	adds	x5, x5, x20
	adcs	x6, x6, x21
	adc	x7, x7, xzr
	#  A[0] * B[3]
	mul	x20, x11, x19
	umulh	x21, x11, x19
	adds	x6, x6, x20
	adcs	x7, x7, x21
	adc	x8, xzr, xzr
	#  A[1] * B[2]
	mul	x20, x12, x17
	umulh	x21, x12, x17
	adds	x6, x6, x20
	adcs	x7, x7, x21
	adc	x8, x8, xzr
	#  A[2] * B[1]
	mul	x20, x13, x16
	umulh	x21, x13, x16
	adds	x6, x6, x20
	adcs	x7, x7, x21
	adc	x8, x8, xzr
	#  A[3] * B[0]
	mul	x20, x14, x15
	umulh	x21, x14, x15
	adds	x6, x6, x20
	adcs	x7, x7, x21
	adc	x8, x8, xzr
	#  A[1] * B[3]
	mul	x20, x12, x19
	umulh	x21, x12, x19
	adds	x7, x7, x20
	adcs	x8, x8, x21
	adc	x9, xzr, xzr
	#  A[2] * B[2]
	mul	x20, x13, x17
	umulh	x21, x13, x17
	adds	x7, x7, x20
	adcs	x8, x8, x21
	adc	x9, x9, xzr
	#  A[3] * B[1]
	mul	x20, x14, x16
	umulh	x21, x14, x16
	adds	x7, x7, x20
	adcs	x8, x8, x21
	adc	x9, x9, xzr
	#  A[2] * B[3]
	mul	x20, x13, x19
	umulh	x21, x13, x19
	adds	x8, x8, x20
	adcs	x9, x9, x21
	adc	x10, xzr, xzr
	#  A[3] * B[2]
	mul	x20, x14, x17
	umulh	x21, x14, x17
	adds	x8, x8, x20
	adcs	x9, x9, x21
	adc	x10, x10, xzr
	#  A[3] * B[3]
	mul	x20, x14, x19
	umulh	x21, x14, x19
	adds	x9, x9, x20
	adc	x10, x10, x21
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	#  After the shifts x7..x10 hold product bits 255 and up; x6 keeps 63 bits
	extr	x10, x10, x9, #63
	extr	x9, x9, x8, #63
	extr	x8, x8, x7, #63
	extr	x7, x7, x6, #63
	and	x6, x6, #0x7fffffffffffffff
	#  Multiply top half by 19
	#  Low words are added in now; high words are kept for the next pass
	mov	x20, #19
	mul	x21, x20, x7
	umulh	x7, x20, x7
	adds	x3, x3, x21
	mul	x21, x20, x8
	umulh	x8, x20, x8
	adcs	x4, x4, x21
	mul	x21, x20, x9
	umulh	x9, x20, x9
	adcs	x5, x5, x21
	mul	x21, x20, x10
	umulh	x22, x20, x10
	adcs	x6, x6, x21
	adc	x22, x22, xzr
	#  Add remaining product results in
	adds	x4, x4, x7
	adcs	x5, x5, x8
	adcs	x6, x6, x9
	adc	x22, x22, xzr
	#  Overflow
	#  Bits 255 and up of x22:x6 are folded in again, times 19
	extr	x22, x22, x6, #63
	mul	x22, x22, x20
	and	x6, x6, #0x7fffffffffffffff
	adds	x3, x3, x22
	adcs	x4, x4, xzr
	adcs	x5, x5, xzr
	adc	x6, x6, xzr
	# Reduce if top bit set
	# Branch-free: x22 is 19 when bit 63 of x6 is set, otherwise 0
	and	x22, x20, x6, asr 63
	and	x6, x6, #0x7fffffffffffffff
	adds	x3, x3, x22
	adcs	x4, x4, xzr
	adcs	x5, x5, xzr
	adc	x6, x6, xzr
	# Store
	stp	x3, x4, [x0]
	stp	x5, x6, [x0, #16]
	# Second product: [incoming x1] = [incoming x4] * [incoming x5]
	ldr	x0, [x29, #16]
	ldr	x1, [x29, #40]
	ldr	x2, [x29, #48]
	# Multiply
	# Limbs: A = x11..x14 from [x1], B = x15, x16, x17, x19 from [x2]
	ldp	x11, x12, [x1]
	ldp	x13, x14, [x1, #16]
	ldp	x15, x16, [x2]
	ldp	x17, x19, [x2, #16]
	#  A[0] * B[0]
	mul	x3, x11, x15
	umulh	x4, x11, x15
	#  A[0] * B[1]
	mul	x20, x11, x16
	umulh	x5, x11, x16
	adds	x4, x4, x20
	adc	x5, x5, xzr
	#  A[1] * B[0]
	mul	x20, x12, x15
	umulh	x21, x12, x15
	adds	x4, x4, x20
	adcs	x5, x5, x21
	adc	x6, xzr, xzr
	#  A[0] * B[2]
	mul	x20, x11, x17
	umulh	x21, x11, x17
	adds	x5, x5, x20
	adc	x6, x6, x21
	#  A[1] * B[1]
	mul	x20, x12, x16
	umulh	x21, x12, x16
	adds	x5, x5, x20
	adcs	x6, x6, x21
	adc	x7, xzr, xzr
	#  A[2] * B[0]
	mul	x20, x13, x15
	umulh	x21, x13, x15
	adds	x5, x5, x20
	adcs	x6, x6, x21
	adc	x7, x7, xzr
	#  A[0] * B[3]
	mul	x20, x11, x19
	umulh	x21, x11, x19
	adds	x6, x6, x20
	adcs	x7, x7, x21
	adc	x8, xzr, xzr
	#  A[1] * B[2]
	mul	x20, x12, x17
	umulh	x21, x12, x17
	adds	x6, x6, x20
	adcs	x7, x7, x21
	adc	x8, x8, xzr
	#  A[2] * B[1]
	mul	x20, x13, x16
	umulh	x21, x13, x16
	adds	x6, x6, x20
	adcs	x7, x7, x21
	adc	x8, x8, xzr
	#  A[3] * B[0]
	mul	x20, x14, x15
	umulh	x21, x14, x15
	adds	x6, x6, x20
	adcs	x7, x7, x21
	adc	x8, x8, xzr
	#  A[1] * B[3]
	mul	x20, x12, x19
	umulh	x21, x12, x19
	adds	x7, x7, x20
	adcs	x8, x8, x21
	adc	x9, xzr, xzr
	#  A[2] * B[2]
	mul	x20, x13, x17
	umulh	x21, x13, x17
	adds	x7, x7, x20
	adcs	x8, x8, x21
	adc	x9, x9, xzr
	#  A[3] * B[1]
	mul	x20, x14, x16
	umulh	x21, x14, x16
	adds	x7, x7, x20
	adcs	x8, x8, x21
	adc	x9, x9, xzr
	#  A[2] * B[3]
	mul	x20, x13, x19
	umulh	x21, x13, x19
	adds	x8, x8, x20
	adcs	x9, x9, x21
	adc	x10, xzr, xzr
	#  A[3] * B[2]
	mul	x20, x14, x17
	umulh	x21, x14, x17
	adds	x8, x8, x20
	adcs	x9, x9, x21
	adc	x10, x10, xzr
	#  A[3] * B[3]
	mul	x20, x14, x19
	umulh	x21, x14, x19
	adds	x9, x9, x20
	adc	x10, x10, x21
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x10, x10, x9, #63
	extr	x9, x9, x8, #63
	extr	x8, x8, x7, #63
	extr	x7, x7, x6, #63
	and	x6, x6, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x20, #19
	mul	x21, x20, x7
	umulh	x7, x20, x7
	adds	x3, x3, x21
	mul	x21, x20, x8
	umulh	x8, x20, x8
	adcs	x4, x4, x21
	mul	x21, x20, x9
	umulh	x9, x20, x9
	adcs	x5, x5, x21
	mul	x21, x20, x10
	umulh	x22, x20, x10
	adcs	x6, x6, x21
	adc	x22, x22, xzr
	#  Add remaining product results in
	adds	x4, x4, x7
	adcs	x5, x5, x8
	adcs	x6, x6, x9
	adc	x22, x22, xzr
	#  Overflow
	extr	x22, x22, x6, #63
	mul	x22, x22, x20
	and	x6, x6, #0x7fffffffffffffff
	adds	x3, x3, x22
	adcs	x4, x4, xzr
	adcs	x5, x5, xzr
	adc	x6, x6, xzr
	# Reduce if top bit set
	and	x22, x20, x6, asr 63
	and	x6, x6, #0x7fffffffffffffff
	adds	x3, x3, x22
	adcs	x4, x4, xzr
	adcs	x5, x5, xzr
	adc	x6, x6, xzr
	# Store
	stp	x3, x4, [x0]
	stp	x5, x6, [x0, #16]
	# Third product: [incoming x2] = [incoming x5] * [incoming x6]
	ldr	x0, [x29, #24]
	ldr	x2, [x29, #56]
	# Multiply
	# Limbs: A = x15, x16, x17, x19 kept from the second product (not
	# reloaded), B = x11..x14 freshly loaded from [x2]
	ldp	x11, x12, [x2]
	ldp	x13, x14, [x2, #16]
	#  A[0] * B[0]
	mul	x3, x15, x11
	umulh	x4, x15, x11
	#  A[0] * B[1]
	mul	x20, x15, x12
	umulh	x5, x15, x12
	adds	x4, x4, x20
	adc	x5, x5, xzr
	#  A[1] * B[0]
	mul	x20, x16, x11
	umulh	x21, x16, x11
	adds	x4, x4, x20
	adcs	x5, x5, x21
	adc	x6, xzr, xzr
	#  A[0] * B[2]
	mul	x20, x15, x13
	umulh	x21, x15, x13
	adds	x5, x5, x20
	adc	x6, x6, x21
	#  A[1] * B[1]
	mul	x20, x16, x12
	umulh	x21, x16, x12
	adds	x5, x5, x20
	adcs	x6, x6, x21
	adc	x7, xzr, xzr
	#  A[2] * B[0]
	mul	x20, x17, x11
	umulh	x21, x17, x11
	adds	x5, x5, x20
	adcs	x6, x6, x21
	adc	x7, x7, xzr
	#  A[0] * B[3]
	mul	x20, x15, x14
	umulh	x21, x15, x14
	adds	x6, x6, x20
	adcs	x7, x7, x21
	adc	x8, xzr, xzr
	#  A[1] * B[2]
	mul	x20, x16, x13
	umulh	x21, x16, x13
	adds	x6, x6, x20
	adcs	x7, x7, x21
	adc	x8, x8, xzr
	#  A[2] * B[1]
	mul	x20, x17, x12
	umulh	x21, x17, x12
	adds	x6, x6, x20
	adcs	x7, x7, x21
	adc	x8, x8, xzr
	#  A[3] * B[0]
	mul	x20, x19, x11
	umulh	x21, x19, x11
	adds	x6, x6, x20
	adcs	x7, x7, x21
	adc	x8, x8, xzr
	#  A[1] * B[3]
	mul	x20, x16, x14
	umulh	x21, x16, x14
	adds	x7, x7, x20
	adcs	x8, x8, x21
	adc	x9, xzr, xzr
	#  A[2] * B[2]
	mul	x20, x17, x13
	umulh	x21, x17, x13
	adds	x7, x7, x20
	adcs	x8, x8, x21
	adc	x9, x9, xzr
	#  A[3] * B[1]
	mul	x20, x19, x12
	umulh	x21, x19, x12
	adds	x7, x7, x20
	adcs	x8, x8, x21
	adc	x9, x9, xzr
	#  A[2] * B[3]
	mul	x20, x17, x14
	umulh	x21, x17, x14
	adds	x8, x8, x20
	adcs	x9, x9, x21
	adc	x10, xzr, xzr
	#  A[3] * B[2]
	mul	x20, x19, x13
	umulh	x21, x19, x13
	adds	x8, x8, x20
	adcs	x9, x9, x21
	adc	x10, x10, xzr
	#  A[3] * B[3]
	mul	x20, x19, x14
	umulh	x21, x19, x14
	adds	x9, x9, x20
	adc	x10, x10, x21
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x10, x10, x9, #63
	extr	x9, x9, x8, #63
	extr	x8, x8, x7, #63
	extr	x7, x7, x6, #63
	and	x6, x6, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x20, #19
	mul	x21, x20, x7
	umulh	x7, x20, x7
	adds	x3, x3, x21
	mul	x21, x20, x8
	umulh	x8, x20, x8
	adcs	x4, x4, x21
	mul	x21, x20, x9
	umulh	x9, x20, x9
	adcs	x5, x5, x21
	mul	x21, x20, x10
	umulh	x22, x20, x10
	adcs	x6, x6, x21
	adc	x22, x22, xzr
	#  Add remaining product results in
	adds	x4, x4, x7
	adcs	x5, x5, x8
	adcs	x6, x6, x9
	adc	x22, x22, xzr
	#  Overflow
	extr	x22, x22, x6, #63
	mul	x22, x22, x20
	and	x6, x6, #0x7fffffffffffffff
	adds	x3, x3, x22
	adcs	x4, x4, xzr
	adcs	x5, x5, xzr
	adc	x6, x6, xzr
	# Reduce if top bit set
	and	x22, x20, x6, asr 63
	and	x6, x6, #0x7fffffffffffffff
	adds	x3, x3, x22
	adcs	x4, x4, xzr
	adcs	x5, x5, xzr
	adc	x6, x6, xzr
	# Store
	stp	x3, x4, [x0]
	stp	x5, x6, [x0, #16]
	# Restore the saved registers and release the 112-byte frame
	ldr	x17, [x29, #72]
	ldr	x19, [x29, #80]
	ldp	x20, x21, [x29, #88]
	ldr	x22, [x29, #104]
	ldp	x29, x30, [sp], #0x70
	ret
	.size	fe_ge_to_p2,.-fe_ge_to_p2
	.text
	.align	2
	.globl	fe_ge_to_p3
	.type	fe_ge_to_p3, %function
fe_ge_to_p3:
	stp	x29, x30, [sp, #-160]!
	add	x29, sp, #0
	str	x17, [x29, #88]
	str	x19, [x29, #96]
	stp	x20, x21, [x29, #104]
	stp	x22, x23, [x29, #120]
	stp	x24, x25, [x29, #136]
	str	x26, [x29, #152]
	str	x1, [x29, #16]
	str	x2, [x29, #24]
	str	x3, [x29, #32]
	str	x4, [x29, #40]
	str	x5, [x29, #48]
	str	x6, [x29, #56]
	str	x7, [x29, #64]
	ldr	x1, [x29, #40]
	ldr	x2, [x29, #64]
	# Multiply
	ldp	x11, x12, [x1]
	ldp	x13, x14, [x1, #16]
	ldp	x15, x16, [x2]
	ldp	x17, x19, [x2, #16]
	#  A[0] * B[0]
	mul	x3, x11, x15
	umulh	x4, x11, x15
	#  A[0] * B[1]
	mul	x24, x11, x16
	umulh	x5, x11, x16
	adds	x4, x4, x24
	adc	x5, x5, xzr
	#  A[1] * B[0]
	mul	x24, x12, x15
	umulh	x25, x12, x15
	adds	x4, x4, x24
	adcs	x5, x5, x25
	adc	x6, xzr, xzr
	#  A[0] * B[2]
	mul	x24, x11, x17
	umulh	x25, x11, x17
	adds	x5, x5, x24
	adc	x6, x6, x25
	#  A[1] * B[1]
	mul	x24, x12, x16
	umulh	x25, x12, x16
	adds	x5, x5, x24
	adcs	x6, x6, x25
	adc	x7, xzr, xzr
	#  A[2] * B[0]
	mul	x24, x13, x15
	umulh	x25, x13, x15
	adds	x5, x5, x24
	adcs	x6, x6, x25
	adc	x7, x7, xzr
	#  A[0] * B[3]
	mul	x24, x11, x19
	umulh	x25, x11, x19
	adds	x6, x6, x24
	adcs	x7, x7, x25
	adc	x8, xzr, xzr
	#  A[1] * B[2]
	mul	x24, x12, x17
	umulh	x25, x12, x17
	adds	x6, x6, x24
	adcs	x7, x7, x25
	adc	x8, x8, xzr
	#  A[2] * B[1]
	mul	x24, x13, x16
	umulh	x25, x13, x16
	adds	x6, x6, x24
	adcs	x7, x7, x25
	adc	x8, x8, xzr
	#  A[3] * B[0]
	mul	x24, x14, x15
	umulh	x25, x14, x15
	adds	x6, x6, x24
	adcs	x7, x7, x25
	adc	x8, x8, xzr
	#  A[1] * B[3]
	mul	x24, x12, x19
	umulh	x25, x12, x19
	adds	x7, x7, x24
	adcs	x8, x8, x25
	adc	x9, xzr, xzr
	#  A[2] * B[2]
	mul	x24, x13, x17
	umulh	x25, x13, x17
	adds	x7, x7, x24
	adcs	x8, x8, x25
	adc	x9, x9, xzr
	#  A[3] * B[1]
	mul	x24, x14, x16
	umulh	x25, x14, x16
	adds	x7, x7, x24
	adcs	x8, x8, x25
	adc	x9, x9, xzr
	#  A[2] * B[3]
	mul	x24, x13, x19
	umulh	x25, x13, x19
	adds	x8, x8, x24
	adcs	x9, x9, x25
	adc	x10, xzr, xzr
	#  A[3] * B[2]
	mul	x24, x14, x17
	umulh	x25, x14, x17
	adds	x8, x8, x24
	adcs	x9, x9, x25
	adc	x10, x10, xzr
	#  A[3] * B[3]
	mul	x24, x14, x19
	umulh	x25, x14, x19
	adds	x9, x9, x24
	adc	x10, x10, x25
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x10, x10, x9, #63
	extr	x9, x9, x8, #63
	extr	x8, x8, x7, #63
	extr	x7, x7, x6, #63
	and	x6, x6, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x24, #19
	mul	x25, x24, x7
	umulh	x7, x24, x7
	adds	x3, x3, x25
	mul	x25, x24, x8
	umulh	x8, x24, x8
	adcs	x4, x4, x25
	mul	x25, x24, x9
	umulh	x9, x24, x9
	adcs	x5, x5, x25
	mul	x25, x24, x10
	umulh	x26, x24, x10
	adcs	x6, x6, x25
	adc	x26, x26, xzr
	#  Add remaining product results in
	adds	x4, x4, x7
	adcs	x5, x5, x8
	adcs	x6, x6, x9
	adc	x26, x26, xzr
	#  Overflow
	extr	x26, x26, x6, #63
	mul	x26, x26, x24
	and	x6, x6, #0x7fffffffffffffff
	adds	x3, x3, x26
	adcs	x4, x4, xzr
	adcs	x5, x5, xzr
	adc	x6, x6, xzr
	# Reduce if top bit set
	and	x26, x24, x6, asr 63
	and	x6, x6, #0x7fffffffffffffff
	adds	x3, x3, x26
	adcs	x4, x4, xzr
	adcs	x5, x5, xzr
	adc	x6, x6, xzr
	# Store
	stp	x3, x4, [x0]
	stp	x5, x6, [x0, #16]
	ldr	x0, [x29, #32]
	ldr	x2, [x29, #48]
	# Multiply
	ldp	x20, x21, [x2]
	ldp	x22, x23, [x2, #16]
	#  A[0] * B[0]
	mul	x3, x11, x20
	umulh	x4, x11, x20
	#  A[0] * B[1]
	mul	x24, x11, x21
	umulh	x5, x11, x21
	adds	x4, x4, x24
	adc	x5, x5, xzr
	#  A[1] * B[0]
	mul	x24, x12, x20
	umulh	x25, x12, x20
	adds	x4, x4, x24
	adcs	x5, x5, x25
	adc	x6, xzr, xzr
	#  A[0] * B[2]
	mul	x24, x11, x22
	umulh	x25, x11, x22
	adds	x5, x5, x24
	adc	x6, x6, x25
	#  A[1] * B[1]
	mul	x24, x12, x21
	umulh	x25, x12, x21
	adds	x5, x5, x24
	adcs	x6, x6, x25
	adc	x7, xzr, xzr
	#  A[2] * B[0]
	mul	x24, x13, x20
	umulh	x25, x13, x20
	adds	x5, x5, x24
	adcs	x6, x6, x25
	adc	x7, x7, xzr
	#  A[0] * B[3]
	mul	x24, x11, x23
	umulh	x25, x11, x23
	adds	x6, x6, x24
	adcs	x7, x7, x25
	adc	x8, xzr, xzr
	#  A[1] * B[2]
	mul	x24, x12, x22
	umulh	x25, x12, x22
	adds	x6, x6, x24
	adcs	x7, x7, x25
	adc	x8, x8, xzr
	#  A[2] * B[1]
	mul	x24, x13, x21
	umulh	x25, x13, x21
	adds	x6, x6, x24
	adcs	x7, x7, x25
	adc	x8, x8, xzr
	#  A[3] * B[0]
	mul	x24, x14, x20
	umulh	x25, x14, x20
	adds	x6, x6, x24
	adcs	x7, x7, x25
	adc	x8, x8, xzr
	#  A[1] * B[3]
	mul	x24, x12, x23
	umulh	x25, x12, x23
	adds	x7, x7, x24
	adcs	x8, x8, x25
	adc	x9, xzr, xzr
	#  A[2] * B[2]
	mul	x24, x13, x22
	umulh	x25, x13, x22
	adds	x7, x7, x24
	adcs	x8, x8, x25
	adc	x9, x9, xzr
	#  A[3] * B[1]
	mul	x24, x14, x21
	umulh	x25, x14, x21
	adds	x7, x7, x24
	adcs	x8, x8, x25
	adc	x9, x9, xzr
	#  A[2] * B[3]
	mul	x24, x13, x23
	umulh	x25, x13, x23
	adds	x8, x8, x24
	adcs	x9, x9, x25
	adc	x10, xzr, xzr
	#  A[3] * B[2]
	mul	x24, x14, x22
	umulh	x25, x14, x22
	adds	x8, x8, x24
	adcs	x9, x9, x25
	adc	x10, x10, xzr
	#  A[3] * B[3]
	mul	x24, x14, x23
	umulh	x25, x14, x23
	adds	x9, x9, x24
	adc	x10, x10, x25
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x10, x10, x9, #63
	extr	x9, x9, x8, #63
	extr	x8, x8, x7, #63
	extr	x7, x7, x6, #63
	and	x6, x6, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x24, #19
	mul	x25, x24, x7
	umulh	x7, x24, x7
	adds	x3, x3, x25
	mul	x25, x24, x8
	umulh	x8, x24, x8
	adcs	x4, x4, x25
	mul	x25, x24, x9
	umulh	x9, x24, x9
	adcs	x5, x5, x25
	mul	x25, x24, x10
	umulh	x26, x24, x10
	adcs	x6, x6, x25
	adc	x26, x26, xzr
	#  Add remaining product results in
	adds	x4, x4, x7
	adcs	x5, x5, x8
	adcs	x6, x6, x9
	adc	x26, x26, xzr
	#  Overflow
	extr	x26, x26, x6, #63
	mul	x26, x26, x24
	and	x6, x6, #0x7fffffffffffffff
	adds	x3, x3, x26
	adcs	x4, x4, xzr
	adcs	x5, x5, xzr
	adc	x6, x6, xzr
	# Reduce if top bit set
	and	x26, x24, x6, asr 63
	and	x6, x6, #0x7fffffffffffffff
	adds	x3, x3, x26
	adcs	x4, x4, xzr
	adcs	x5, x5, xzr
	adc	x6, x6, xzr
	# Store
	stp	x3, x4, [x0]
	stp	x5, x6, [x0, #16]
	ldr	x0, [x29, #16]
	ldr	x2, [x29, #56]
	# Multiply
	ldp	x11, x12, [x2]
	ldp	x13, x14, [x2, #16]
	#  A[0] * B[0]
	mul	x3, x20, x11
	umulh	x4, x20, x11
	#  A[0] * B[1]
	mul	x24, x20, x12
	umulh	x5, x20, x12
	adds	x4, x4, x24
	adc	x5, x5, xzr
	#  A[1] * B[0]
	mul	x24, x21, x11
	umulh	x25, x21, x11
	adds	x4, x4, x24
	adcs	x5, x5, x25
	adc	x6, xzr, xzr
	#  A[0] * B[2]
	mul	x24, x20, x13
	umulh	x25, x20, x13
	adds	x5, x5, x24
	adc	x6, x6, x25
	#  A[1] * B[1]
	mul	x24, x21, x12
	umulh	x25, x21, x12
	adds	x5, x5, x24
	adcs	x6, x6, x25
	adc	x7, xzr, xzr
	#  A[2] * B[0]
	mul	x24, x22, x11
	umulh	x25, x22, x11
	adds	x5, x5, x24
	adcs	x6, x6, x25
	adc	x7, x7, xzr
	#  A[0] * B[3]
	mul	x24, x20, x14
	umulh	x25, x20, x14
	adds	x6, x6, x24
	adcs	x7, x7, x25
	adc	x8, xzr, xzr
	#  A[1] * B[2]
	mul	x24, x21, x13
	umulh	x25, x21, x13
	adds	x6, x6, x24
	adcs	x7, x7, x25
	adc	x8, x8, xzr
	#  A[2] * B[1]
	mul	x24, x22, x12
	umulh	x25, x22, x12
	adds	x6, x6, x24
	adcs	x7, x7, x25
	adc	x8, x8, xzr
	#  A[3] * B[0]
	mul	x24, x23, x11
	umulh	x25, x23, x11
	adds	x6, x6, x24
	adcs	x7, x7, x25
	adc	x8, x8, xzr
	#  A[1] * B[3]
	mul	x24, x21, x14
	umulh	x25, x21, x14
	adds	x7, x7, x24
	adcs	x8, x8, x25
	adc	x9, xzr, xzr
	#  A[2] * B[2]
	mul	x24, x22, x13
	umulh	x25, x22, x13
	adds	x7, x7, x24
	adcs	x8, x8, x25
	adc	x9, x9, xzr
	#  A[3] * B[1]
	mul	x24, x23, x12
	umulh	x25, x23, x12
	adds	x7, x7, x24
	adcs	x8, x8, x25
	adc	x9, x9, xzr
	#  A[2] * B[3]
	mul	x24, x22, x14
	umulh	x25, x22, x14
	adds	x8, x8, x24
	adcs	x9, x9, x25
	adc	x10, xzr, xzr
	#  A[3] * B[2]
	mul	x24, x23, x13
	umulh	x25, x23, x13
	adds	x8, x8, x24
	adcs	x9, x9, x25
	adc	x10, x10, xzr
	#  A[3] * B[3]
	mul	x24, x23, x14
	umulh	x25, x23, x14
	adds	x9, x9, x24
	adc	x10, x10, x25
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x10, x10, x9, #63
	extr	x9, x9, x8, #63
	extr	x8, x8, x7, #63
	extr	x7, x7, x6, #63
	and	x6, x6, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x24, #19
	mul	x25, x24, x7
	umulh	x7, x24, x7
	adds	x3, x3, x25
	mul	x25, x24, x8
	umulh	x8, x24, x8
	adcs	x4, x4, x25
	mul	x25, x24, x9
	umulh	x9, x24, x9
	adcs	x5, x5, x25
	mul	x25, x24, x10
	umulh	x26, x24, x10
	adcs	x6, x6, x25
	adc	x26, x26, xzr
	#  Add remaining product results in
	adds	x4, x4, x7
	adcs	x5, x5, x8
	adcs	x6, x6, x9
	adc	x26, x26, xzr
	#  Overflow
	extr	x26, x26, x6, #63
	mul	x26, x26, x24
	and	x6, x6, #0x7fffffffffffffff
	adds	x3, x3, x26
	adcs	x4, x4, xzr
	adcs	x5, x5, xzr
	adc	x6, x6, xzr
	# Reduce if top bit set
	and	x26, x24, x6, asr 63
	and	x6, x6, #0x7fffffffffffffff
	adds	x3, x3, x26
	adcs	x4, x4, xzr
	adcs	x5, x5, xzr
	adc	x6, x6, xzr
	# Store
	stp	x3, x4, [x0]
	stp	x5, x6, [x0, #16]
	ldr	x0, [x29, #24]
	# Multiply
	#  A[0] * B[0]
	mul	x3, x11, x15
	umulh	x4, x11, x15
	#  A[0] * B[1]
	mul	x24, x11, x16
	umulh	x5, x11, x16
	adds	x4, x4, x24
	adc	x5, x5, xzr
	#  A[1] * B[0]
	mul	x24, x12, x15
	umulh	x25, x12, x15
	adds	x4, x4, x24
	adcs	x5, x5, x25
	adc	x6, xzr, xzr
	#  A[0] * B[2]
	mul	x24, x11, x17
	umulh	x25, x11, x17
	adds	x5, x5, x24
	adc	x6, x6, x25
	#  A[1] * B[1]
	mul	x24, x12, x16
	umulh	x25, x12, x16
	adds	x5, x5, x24
	adcs	x6, x6, x25
	adc	x7, xzr, xzr
	#  A[2] * B[0]
	mul	x24, x13, x15
	umulh	x25, x13, x15
	adds	x5, x5, x24
	adcs	x6, x6, x25
	adc	x7, x7, xzr
	#  A[0] * B[3]
	mul	x24, x11, x19
	umulh	x25, x11, x19
	adds	x6, x6, x24
	adcs	x7, x7, x25
	adc	x8, xzr, xzr
	#  A[1] * B[2]
	mul	x24, x12, x17
	umulh	x25, x12, x17
	adds	x6, x6, x24
	adcs	x7, x7, x25
	adc	x8, x8, xzr
	#  A[2] * B[1]
	mul	x24, x13, x16
	umulh	x25, x13, x16
	adds	x6, x6, x24
	adcs	x7, x7, x25
	adc	x8, x8, xzr
	#  A[3] * B[0]
	mul	x24, x14, x15
	umulh	x25, x14, x15
	adds	x6, x6, x24
	adcs	x7, x7, x25
	adc	x8, x8, xzr
	#  A[1] * B[3]
	mul	x24, x12, x19
	umulh	x25, x12, x19
	adds	x7, x7, x24
	adcs	x8, x8, x25
	adc	x9, xzr, xzr
	#  A[2] * B[2]
	mul	x24, x13, x17
	umulh	x25, x13, x17
	adds	x7, x7, x24
	adcs	x8, x8, x25
	adc	x9, x9, xzr
	#  A[3] * B[1]
	mul	x24, x14, x16
	umulh	x25, x14, x16
	adds	x7, x7, x24
	adcs	x8, x8, x25
	adc	x9, x9, xzr
	#  A[2] * B[3]
	mul	x24, x13, x19
	umulh	x25, x13, x19
	adds	x8, x8, x24
	adcs	x9, x9, x25
	adc	x10, xzr, xzr
	#  A[3] * B[2]
	mul	x24, x14, x17
	umulh	x25, x14, x17
	adds	x8, x8, x24
	adcs	x9, x9, x25
	adc	x10, x10, xzr
	#  A[3] * B[3]
	mul	x24, x14, x19
	umulh	x25, x14, x19
	adds	x9, x9, x24
	adc	x10, x10, x25
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x10, x10, x9, #63
	extr	x9, x9, x8, #63
	extr	x8, x8, x7, #63
	extr	x7, x7, x6, #63
	and	x6, x6, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x24, #19
	mul	x25, x24, x7
	umulh	x7, x24, x7
	adds	x3, x3, x25
	mul	x25, x24, x8
	umulh	x8, x24, x8
	adcs	x4, x4, x25
	mul	x25, x24, x9
	umulh	x9, x24, x9
	adcs	x5, x5, x25
	mul	x25, x24, x10
	umulh	x26, x24, x10
	adcs	x6, x6, x25
	adc	x26, x26, xzr
	#  Add remaining product results in
	adds	x4, x4, x7
	adcs	x5, x5, x8
	adcs	x6, x6, x9
	adc	x26, x26, xzr
	#  Overflow
	extr	x26, x26, x6, #63
	mul	x26, x26, x24
	and	x6, x6, #0x7fffffffffffffff
	adds	x3, x3, x26
	adcs	x4, x4, xzr
	adcs	x5, x5, xzr
	adc	x6, x6, xzr
	# Reduce if top bit set
	and	x26, x24, x6, asr 63
	and	x6, x6, #0x7fffffffffffffff
	adds	x3, x3, x26
	adcs	x4, x4, xzr
	adcs	x5, x5, xzr
	adc	x6, x6, xzr
	# Store
	stp	x3, x4, [x0]
	stp	x5, x6, [x0, #16]
	ldr	x17, [x29, #88]
	ldr	x19, [x29, #96]
	ldp	x20, x21, [x29, #104]
	ldp	x22, x23, [x29, #120]
	ldp	x24, x25, [x29, #136]
	ldr	x26, [x29, #152]
	ldp	x29, x30, [sp], #0xa0
	ret
	.size	fe_ge_to_p3,.-fe_ge_to_p3
	.text
	.align	2
	.globl	fe_ge_dbl
	.type	fe_ge_dbl, %function
/* fe_ge_dbl: field arithmetic of a point-doubling step, modulo 2^255 - 19.
 *
 * Register arguments (each a pointer to four 64-bit limbs, least-significant
 * limb first):
 *   x0, x1, x2, x3 - outputs, called r0, r1, r2, r3 below
 *   x4, x5, x6     - inputs,  called a, b, c below
 * (Presumably rx, ry, rz, rt, px, py, pz of the C prototype - confirm against
 * the header that declares fe_ge_dbl.)
 *
 * Final values written:
 *   r1 = b^2 + a^2
 *   r2 = b^2 - a^2
 *   r0 = (a + b)^2 - r1
 *   r3 = 2 * c^2 - r2
 * r0, r2 and r3 also receive scratch values first (a^2, b^2 and (a + b)^2
 * respectively) before b and c have been loaded.
 * NOTE(review): an output buffer that overlaps an input read later would
 * therefore see the scratch value; whether callers may alias buffers is not
 * visible in this file.
 *
 * Multiplication results are folded with the factor 19 (2^255 = 19 mod p) and
 * then corrected once more when bit 255 is set; additions and subtractions
 * are corrected by a masked subtract/add of the modulus.  Results are not
 * necessarily the canonical representative.
 *
 * Frame: 176 bytes.  The seven pointer arguments are spilled to
 * [x29, #16] .. [x29, #64]; x17 and x19-x28 are saved at [x29, #88] ..
 * [x29, #168] and restored before returning.  The body contains no branches.
 */
fe_ge_dbl:
	stp	x29, x30, [sp, #-176]!
	add	x29, sp, #0
	str	x17, [x29, #88]
	str	x19, [x29, #96]
	stp	x20, x21, [x29, #104]
	stp	x22, x23, [x29, #120]
	stp	x24, x25, [x29, #136]
	stp	x26, x27, [x29, #152]
	str	x28, [x29, #168]
	# Spill the seven pointer arguments
	str	x0, [x29, #16]
	str	x1, [x29, #24]
	str	x2, [x29, #32]
	str	x3, [x29, #40]
	str	x4, [x29, #48]
	str	x5, [x29, #56]
	str	x6, [x29, #64]
	ldr	x1, [x29, #48]
	# Square: x4-x7 = a^2 (x0 still holds r0)
	ldp	x12, x13, [x1]
	ldp	x14, x15, [x1, #16]
	#  A[0] * A[1]
	mul	x5, x12, x13
	umulh	x6, x12, x13
	#  A[0] * A[2]
	mul	x25, x12, x14
	umulh	x7, x12, x14
	adds	x6, x6, x25
	adc	x7, x7, xzr
	#  A[0] * A[3]
	mul	x25, x12, x15
	umulh	x8, x12, x15
	adds	x7, x7, x25
	adc	x8, x8, xzr
	#  A[1] * A[2]
	mul	x25, x13, x14
	umulh	x26, x13, x14
	adds	x7, x7, x25
	adcs	x8, x8, x26
	adc	x9, xzr, xzr
	#  A[1] * A[3]
	mul	x25, x13, x15
	umulh	x26, x13, x15
	adds	x8, x8, x25
	adc	x9, x9, x26
	#  A[2] * A[3]
	mul	x25, x14, x15
	umulh	x10, x14, x15
	adds	x9, x9, x25
	adc	x10, x10, xzr
	# Double
	adds	x5, x5, x5
	adcs	x6, x6, x6
	adcs	x7, x7, x7
	adcs	x8, x8, x8
	adcs	x9, x9, x9
	adcs	x10, x10, x10
	adc	x11, xzr, xzr
	#  A[0] * A[0]
	mul	x4, x12, x12
	umulh	x27, x12, x12
	#  A[1] * A[1]
	mul	x25, x13, x13
	umulh	x26, x13, x13
	adds	x5, x5, x27
	adcs	x6, x6, x25
	adc	x27, x26, xzr
	#  A[2] * A[2]
	mul	x25, x14, x14
	umulh	x26, x14, x14
	adds	x7, x7, x27
	adcs	x8, x8, x25
	adc	x27, x26, xzr
	#  A[3] * A[3]
	mul	x25, x15, x15
	umulh	x26, x15, x15
	adds	x9, x9, x27
	adcs	x10, x10, x25
	adc	x11, x11, x26
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x11, x11, x10, #63
	extr	x10, x10, x9, #63
	extr	x9, x9, x8, #63
	extr	x8, x8, x7, #63
	and	x7, x7, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x25, #19
	mul	x26, x25, x8
	umulh	x8, x25, x8
	adds	x4, x4, x26
	mul	x26, x25, x9
	umulh	x9, x25, x9
	adcs	x5, x5, x26
	mul	x26, x25, x10
	umulh	x10, x25, x10
	adcs	x6, x6, x26
	mul	x26, x25, x11
	umulh	x27, x25, x11
	adcs	x7, x7, x26
	adc	x27, x27, xzr
	#  Add remaining product results in
	adds	x5, x5, x8
	adcs	x6, x6, x9
	adcs	x7, x7, x10
	adc	x27, x27, xzr
	#  Overflow
	extr	x27, x27, x7, #63
	mul	x27, x27, x25
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Reduce if top bit set
	and	x27, x25, x7, asr 63
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Store a^2 to r0 (scratch, final r0 is written later)
	stp	x4, x5, [x0]
	stp	x6, x7, [x0, #16]
	ldr	x0, [x29, #32]
	ldr	x1, [x29, #56]
	# Square: x8-x11 = b^2
	ldp	x21, x22, [x1]
	ldp	x23, x24, [x1, #16]
	#  A[0] * A[1]
	mul	x9, x21, x22
	umulh	x10, x21, x22
	#  A[0] * A[2]
	mul	x25, x21, x23
	umulh	x11, x21, x23
	adds	x10, x10, x25
	adc	x11, x11, xzr
	#  A[0] * A[3]
	mul	x25, x21, x24
	umulh	x16, x21, x24
	adds	x11, x11, x25
	adc	x16, x16, xzr
	#  A[1] * A[2]
	mul	x25, x22, x23
	umulh	x26, x22, x23
	adds	x11, x11, x25
	adcs	x16, x16, x26
	adc	x17, xzr, xzr
	#  A[1] * A[3]
	mul	x25, x22, x24
	umulh	x26, x22, x24
	adds	x16, x16, x25
	adc	x17, x17, x26
	#  A[2] * A[3]
	mul	x25, x23, x24
	umulh	x19, x23, x24
	adds	x17, x17, x25
	adc	x19, x19, xzr
	# Double
	adds	x9, x9, x9
	adcs	x10, x10, x10
	adcs	x11, x11, x11
	adcs	x16, x16, x16
	adcs	x17, x17, x17
	adcs	x19, x19, x19
	adc	x20, xzr, xzr
	#  A[0] * A[0]
	mul	x8, x21, x21
	umulh	x27, x21, x21
	#  A[1] * A[1]
	mul	x25, x22, x22
	umulh	x26, x22, x22
	adds	x9, x9, x27
	adcs	x10, x10, x25
	adc	x27, x26, xzr
	#  A[2] * A[2]
	mul	x25, x23, x23
	umulh	x26, x23, x23
	adds	x11, x11, x27
	adcs	x16, x16, x25
	adc	x27, x26, xzr
	#  A[3] * A[3]
	mul	x25, x24, x24
	umulh	x26, x24, x24
	adds	x17, x17, x27
	adcs	x19, x19, x25
	adc	x20, x20, x26
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x20, x20, x19, #63
	extr	x19, x19, x17, #63
	extr	x17, x17, x16, #63
	extr	x16, x16, x11, #63
	and	x11, x11, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x25, #19
	mul	x26, x25, x16
	umulh	x16, x25, x16
	adds	x8, x8, x26
	mul	x26, x25, x17
	umulh	x17, x25, x17
	adcs	x9, x9, x26
	mul	x26, x25, x19
	umulh	x19, x25, x19
	adcs	x10, x10, x26
	mul	x26, x25, x20
	umulh	x27, x25, x20
	adcs	x11, x11, x26
	adc	x27, x27, xzr
	#  Add remaining product results in
	adds	x9, x9, x16
	adcs	x10, x10, x17
	adcs	x11, x11, x19
	adc	x27, x27, xzr
	#  Overflow
	extr	x27, x27, x11, #63
	mul	x27, x27, x25
	and	x11, x11, #0x7fffffffffffffff
	adds	x8, x8, x27
	adcs	x9, x9, xzr
	adcs	x10, x10, xzr
	adc	x11, x11, xzr
	# Reduce if top bit set
	and	x27, x25, x11, asr 63
	and	x11, x11, #0x7fffffffffffffff
	adds	x8, x8, x27
	adcs	x9, x9, xzr
	adcs	x10, x10, xzr
	adc	x11, x11, xzr
	# Store b^2 to r2 (scratch, final r2 is written later)
	stp	x8, x9, [x0]
	stp	x10, x11, [x0, #16]
	# Add: x12-x15 = a + b (kept in registers only)
	adds	x12, x12, x21
	adcs	x13, x13, x22
	adcs	x14, x14, x23
	adc	x15, x15, x24
	mov	x25, #-19
	asr	x28, x15, #63
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x12, x12, x25
	sbcs	x13, x13, x28
	sbcs	x14, x14, x28
	sbc	x15, x15, x26
	ldr	x0, [x29, #40]
	# Square: x16, x17, x19, x20 = (a + b)^2
	#  A[0] * A[1]
	mul	x17, x12, x13
	umulh	x19, x12, x13
	#  A[0] * A[2]
	mul	x25, x12, x14
	umulh	x20, x12, x14
	adds	x19, x19, x25
	adc	x20, x20, xzr
	#  A[0] * A[3]
	mul	x25, x12, x15
	umulh	x21, x12, x15
	adds	x20, x20, x25
	adc	x21, x21, xzr
	#  A[1] * A[2]
	mul	x25, x13, x14
	umulh	x26, x13, x14
	adds	x20, x20, x25
	adcs	x21, x21, x26
	adc	x22, xzr, xzr
	#  A[1] * A[3]
	mul	x25, x13, x15
	umulh	x26, x13, x15
	adds	x21, x21, x25
	adc	x22, x22, x26
	#  A[2] * A[3]
	mul	x25, x14, x15
	umulh	x23, x14, x15
	adds	x22, x22, x25
	adc	x23, x23, xzr
	# Double
	adds	x17, x17, x17
	adcs	x19, x19, x19
	adcs	x20, x20, x20
	adcs	x21, x21, x21
	adcs	x22, x22, x22
	adcs	x23, x23, x23
	adc	x24, xzr, xzr
	#  A[0] * A[0]
	mul	x16, x12, x12
	umulh	x27, x12, x12
	#  A[1] * A[1]
	mul	x25, x13, x13
	umulh	x26, x13, x13
	adds	x17, x17, x27
	adcs	x19, x19, x25
	adc	x27, x26, xzr
	#  A[2] * A[2]
	mul	x25, x14, x14
	umulh	x26, x14, x14
	adds	x20, x20, x27
	adcs	x21, x21, x25
	adc	x27, x26, xzr
	#  A[3] * A[3]
	mul	x25, x15, x15
	umulh	x26, x15, x15
	adds	x22, x22, x27
	adcs	x23, x23, x25
	adc	x24, x24, x26
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x24, x24, x23, #63
	extr	x23, x23, x22, #63
	extr	x22, x22, x21, #63
	extr	x21, x21, x20, #63
	and	x20, x20, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x25, #19
	mul	x26, x25, x21
	umulh	x21, x25, x21
	adds	x16, x16, x26
	mul	x26, x25, x22
	umulh	x22, x25, x22
	adcs	x17, x17, x26
	mul	x26, x25, x23
	umulh	x23, x25, x23
	adcs	x19, x19, x26
	mul	x26, x25, x24
	umulh	x27, x25, x24
	adcs	x20, x20, x26
	adc	x27, x27, xzr
	#  Add remaining product results in
	adds	x17, x17, x21
	adcs	x19, x19, x22
	adcs	x20, x20, x23
	adc	x27, x27, xzr
	#  Overflow
	extr	x27, x27, x20, #63
	mul	x27, x27, x25
	and	x20, x20, #0x7fffffffffffffff
	adds	x16, x16, x27
	adcs	x17, x17, xzr
	adcs	x19, x19, xzr
	adc	x20, x20, xzr
	# Reduce if top bit set
	and	x27, x25, x20, asr 63
	and	x20, x20, #0x7fffffffffffffff
	adds	x16, x16, x27
	adcs	x17, x17, xzr
	adcs	x19, x19, xzr
	adc	x20, x20, xzr
	# Store (a + b)^2 to r3 (scratch, final r3 is written later)
	stp	x16, x17, [x0]
	stp	x19, x20, [x0, #16]
	ldr	x0, [x29, #24]
	ldr	x1, [x29, #32]
	# Add: x12-x15 = b^2 + a^2
	adds	x12, x8, x4
	adcs	x13, x9, x5
	adcs	x14, x10, x6
	adc	x15, x11, x7
	mov	x25, #-19
	asr	x28, x15, #63
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x12, x12, x25
	sbcs	x13, x13, x28
	sbcs	x14, x14, x28
	sbc	x15, x15, x26
	# Sub: x21-x24 = b^2 - a^2
	subs	x21, x8, x4
	sbcs	x22, x9, x5
	sbcs	x23, x10, x6
	sbcs	x24, x11, x7
	mov	x25, #-19
	csetm	x28, cc
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Add modulus (if underflow)
	adds	x21, x21, x25
	adcs	x22, x22, x28
	adcs	x23, x23, x28
	adc	x24, x24, x26
	# Store final r1 (sum) and r2 (difference)
	stp	x12, x13, [x0]
	stp	x14, x15, [x0, #16]
	stp	x21, x22, [x1]
	stp	x23, x24, [x1, #16]
	ldr	x0, [x29, #16]
	# Sub: final r0 = (a + b)^2 - r1
	subs	x16, x16, x12
	sbcs	x17, x17, x13
	sbcs	x19, x19, x14
	sbcs	x20, x20, x15
	mov	x25, #-19
	csetm	x28, cc
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Add modulus (if underflow)
	adds	x16, x16, x25
	adcs	x17, x17, x28
	adcs	x19, x19, x28
	adc	x20, x20, x26
	stp	x16, x17, [x0]
	stp	x19, x20, [x0, #16]
	ldr	x0, [x29, #40]
	ldr	x1, [x29, #64]
	# Square * 2
	ldp	x12, x13, [x1]
	ldp	x14, x15, [x1, #16]
	#  A[0] * A[1]
	mul	x5, x12, x13
	umulh	x6, x12, x13
	#  A[0] * A[2]
	mul	x25, x12, x14
	umulh	x7, x12, x14
	adds	x6, x6, x25
	adc	x7, x7, xzr
	#  A[0] * A[3]
	mul	x25, x12, x15
	umulh	x8, x12, x15
	adds	x7, x7, x25
	adc	x8, x8, xzr
	#  A[1] * A[2]
	mul	x25, x13, x14
	umulh	x26, x13, x14
	adds	x7, x7, x25
	adcs	x8, x8, x26
	adc	x9, xzr, xzr
	#  A[1] * A[3]
	mul	x25, x13, x15
	umulh	x26, x13, x15
	adds	x8, x8, x25
	adc	x9, x9, x26
	#  A[2] * A[3]
	mul	x25, x14, x15
	umulh	x10, x14, x15
	adds	x9, x9, x25
	adc	x10, x10, xzr
	# Double
	adds	x5, x5, x5
	adcs	x6, x6, x6
	adcs	x7, x7, x7
	adcs	x8, x8, x8
	adcs	x9, x9, x9
	adcs	x10, x10, x10
	adc	x11, xzr, xzr
	#  A[0] * A[0]
	mul	x4, x12, x12
	umulh	x28, x12, x12
	#  A[1] * A[1]
	mul	x25, x13, x13
	umulh	x26, x13, x13
	adds	x5, x5, x28
	adcs	x6, x6, x25
	adc	x28, x26, xzr
	#  A[2] * A[2]
	mul	x25, x14, x14
	umulh	x26, x14, x14
	adds	x7, x7, x28
	adcs	x8, x8, x25
	adc	x28, x26, xzr
	#  A[3] * A[3]
	mul	x25, x15, x15
	umulh	x26, x15, x15
	adds	x9, x9, x28
	adcs	x10, x10, x25
	adc	x11, x11, x26
	# Double and Reduce
	mov	x25, #0x169
	#  Move top half into t4-t7 and remove top bit from t3
	lsr	x28, x11, #61
	extr	x11, x11, x10, #62
	extr	x10, x10, x9, #62
	extr	x9, x9, x8, #62
	extr	x8, x8, x7, #62
	extr	x7, x7, x6, #63
	extr	x6, x6, x5, #63
	extr	x5, x5, x4, #63
	lsl	x4, x4, #1
	and	x7, x7, #0x7fffffffffffffff
	#  Two left, only one right
	and	x11, x11, #0x7fffffffffffffff
	#  Multiply top bits by 19*19
	mul	x28, x28, x25
	#  Multiply top half by 19
	mov	x25, #19
	mul	x26, x25, x8
	umulh	x8, x25, x8
	adds	x4, x4, x26
	mul	x26, x25, x9
	umulh	x9, x25, x9
	adcs	x5, x5, x26
	mul	x26, x25, x10
	umulh	x10, x25, x10
	adcs	x6, x6, x26
	mul	x26, x25, x11
	umulh	x27, x25, x11
	adcs	x7, x7, x26
	adc	x27, x27, xzr
	#  Add remaining product results in
	adds	x4, x4, x28
	adcs	x5, x5, x8
	adcs	x6, x6, x9
	adcs	x7, x7, x10
	adc	x27, x27, xzr
	#  Overflow
	extr	x27, x27, x7, #63
	mul	x27, x27, x25
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Reduce if top bit set
	and	x27, x25, x7, asr 63
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Sub: final r3 = 2 * c^2 - r2 (x0 still points at r3, x21-x24 hold r2)
	subs	x4, x4, x21
	sbcs	x5, x5, x22
	sbcs	x6, x6, x23
	sbcs	x7, x7, x24
	mov	x25, #-19
	csetm	x28, cc
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Add modulus (if underflow)
	adds	x4, x4, x25
	adcs	x5, x5, x28
	adcs	x6, x6, x28
	adc	x7, x7, x26
	stp	x4, x5, [x0]
	stp	x6, x7, [x0, #16]
	# Restore saved registers and release the frame
	ldr	x17, [x29, #88]
	ldr	x19, [x29, #96]
	ldp	x20, x21, [x29, #104]
	ldp	x22, x23, [x29, #120]
	ldp	x24, x25, [x29, #136]
	ldp	x26, x27, [x29, #152]
	ldr	x28, [x29, #168]
	ldp	x29, x30, [sp], #0xb0
	ret
	.size	fe_ge_dbl,.-fe_ge_dbl
	.text
	.align	2
	.globl	fe_ge_madd
	.type	fe_ge_madd, %function
/* fe_ge_madd: field arithmetic of a mixed point addition, modulo 2^255 - 19.
 *
 * Arguments (each a pointer to four 64-bit limbs, least-significant limb
 * first):
 *   x0, x1, x2, x3     - outputs, called r0, r1, r2, r3 below
 *   x4, x5, x6, x7     - inputs,  called pa, pb, pc, pd below
 *   [x29, #176]        - first stack argument,  called q0 below
 *   [x29, #184]        - second stack argument, called q1 below
 *   [x29, #192]        - third stack argument,  called q2 below
 * The stack arguments sit directly above the 176-byte frame created by the
 * prologue.
 * (Presumably rx, ry, rz, rt, px, py, pz, pt, qxy2d, qyplusx, qyminusx of the
 * C prototype - confirm against the header that declares fe_ge_madd.)
 *
 * Values written:
 *   m1 = (pb + pa) * q1
 *   m2 = (pb - pa) * q2
 *   r0 = m1 - m2
 *   r1 = m1 + m2
 *   m3 = q0 * pd
 *   r2 = 2 * pc + m3
 *   r3 = 2 * pc - m3
 * Intermediate values stay in registers; each output is stored exactly once.
 *
 * Multiplication results are folded with the factor 19 (2^255 = 19 mod p) and
 * then corrected once more when bit 255 is set; additions and subtractions
 * are corrected by a masked subtract/add of the modulus.  Results are not
 * necessarily the canonical representative.
 *
 * Frame: 176 bytes.  The eight register arguments are spilled to
 * [x29, #16] .. [x29, #72]; x17 and x19-x28 are saved at [x29, #88] ..
 * [x29, #168] and restored before returning.  The body contains no branches.
 */
fe_ge_madd:
	stp	x29, x30, [sp, #-176]!
	add	x29, sp, #0
	str	x17, [x29, #88]
	str	x19, [x29, #96]
	stp	x20, x21, [x29, #104]
	stp	x22, x23, [x29, #120]
	stp	x24, x25, [x29, #136]
	stp	x26, x27, [x29, #152]
	str	x28, [x29, #168]
	# Spill the eight register arguments
	str	x0, [x29, #16]
	str	x1, [x29, #24]
	str	x2, [x29, #32]
	str	x3, [x29, #40]
	str	x4, [x29, #48]
	str	x5, [x29, #56]
	str	x6, [x29, #64]
	str	x7, [x29, #72]
	ldr	x2, [x29, #56]
	ldr	x3, [x29, #48]
	# Add: x4-x7 = pb + pa
	ldp	x12, x13, [x2]
	ldp	x14, x15, [x2, #16]
	ldp	x16, x17, [x3]
	ldp	x19, x20, [x3, #16]
	adds	x4, x12, x16
	adcs	x5, x13, x17
	adcs	x6, x14, x19
	adc	x7, x15, x20
	mov	x25, #-19
	asr	x28, x7, #63
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x4, x4, x25
	sbcs	x5, x5, x28
	sbcs	x6, x6, x28
	sbc	x7, x7, x26
	# Sub: x8-x11 = pb - pa
	subs	x8, x12, x16
	sbcs	x9, x13, x17
	sbcs	x10, x14, x19
	sbcs	x11, x15, x20
	mov	x25, #-19
	csetm	x28, cc
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Add modulus (if underflow)
	adds	x8, x8, x25
	adcs	x9, x9, x28
	adcs	x10, x10, x28
	adc	x11, x11, x26
	ldr	x2, [x29, #184]
	# Multiply: x12-x15 = (pb + pa) * q1
	ldp	x21, x22, [x2]
	ldp	x23, x24, [x2, #16]
	#  A[0] * B[0]
	mul	x12, x4, x21
	umulh	x13, x4, x21
	#  A[0] * B[1]
	mul	x25, x4, x22
	umulh	x14, x4, x22
	adds	x13, x13, x25
	adc	x14, x14, xzr
	#  A[1] * B[0]
	mul	x25, x5, x21
	umulh	x26, x5, x21
	adds	x13, x13, x25
	adcs	x14, x14, x26
	adc	x15, xzr, xzr
	#  A[0] * B[2]
	mul	x25, x4, x23
	umulh	x26, x4, x23
	adds	x14, x14, x25
	adc	x15, x15, x26
	#  A[1] * B[1]
	mul	x25, x5, x22
	umulh	x26, x5, x22
	adds	x14, x14, x25
	adcs	x15, x15, x26
	adc	x16, xzr, xzr
	#  A[2] * B[0]
	mul	x25, x6, x21
	umulh	x26, x6, x21
	adds	x14, x14, x25
	adcs	x15, x15, x26
	adc	x16, x16, xzr
	#  A[0] * B[3]
	mul	x25, x4, x24
	umulh	x26, x4, x24
	adds	x15, x15, x25
	adcs	x16, x16, x26
	adc	x17, xzr, xzr
	#  A[1] * B[2]
	mul	x25, x5, x23
	umulh	x26, x5, x23
	adds	x15, x15, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[2] * B[1]
	mul	x25, x6, x22
	umulh	x26, x6, x22
	adds	x15, x15, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[3] * B[0]
	mul	x25, x7, x21
	umulh	x26, x7, x21
	adds	x15, x15, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[1] * B[3]
	mul	x25, x5, x24
	umulh	x26, x5, x24
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, xzr, xzr
	#  A[2] * B[2]
	mul	x25, x6, x23
	umulh	x26, x6, x23
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, x19, xzr
	#  A[3] * B[1]
	mul	x25, x7, x22
	umulh	x26, x7, x22
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, x19, xzr
	#  A[2] * B[3]
	mul	x25, x6, x24
	umulh	x26, x6, x24
	adds	x17, x17, x25
	adcs	x19, x19, x26
	adc	x20, xzr, xzr
	#  A[3] * B[2]
	mul	x25, x7, x23
	umulh	x26, x7, x23
	adds	x17, x17, x25
	adcs	x19, x19, x26
	adc	x20, x20, xzr
	#  A[3] * B[3]
	mul	x25, x7, x24
	umulh	x26, x7, x24
	adds	x19, x19, x25
	adc	x20, x20, x26
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x20, x20, x19, #63
	extr	x19, x19, x17, #63
	extr	x17, x17, x16, #63
	extr	x16, x16, x15, #63
	and	x15, x15, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x25, #19
	mul	x26, x25, x16
	umulh	x16, x25, x16
	adds	x12, x12, x26
	mul	x26, x25, x17
	umulh	x17, x25, x17
	adcs	x13, x13, x26
	mul	x26, x25, x19
	umulh	x19, x25, x19
	adcs	x14, x14, x26
	mul	x26, x25, x20
	umulh	x27, x25, x20
	adcs	x15, x15, x26
	adc	x27, x27, xzr
	#  Add remaining product results in
	adds	x13, x13, x16
	adcs	x14, x14, x17
	adcs	x15, x15, x19
	adc	x27, x27, xzr
	#  Overflow
	extr	x27, x27, x15, #63
	mul	x27, x27, x25
	and	x15, x15, #0x7fffffffffffffff
	adds	x12, x12, x27
	adcs	x13, x13, xzr
	adcs	x14, x14, xzr
	adc	x15, x15, xzr
	# Reduce if top bit set
	and	x27, x25, x15, asr 63
	and	x15, x15, #0x7fffffffffffffff
	adds	x12, x12, x27
	adcs	x13, x13, xzr
	adcs	x14, x14, xzr
	adc	x15, x15, xzr
	# Product m1 stays in x12-x15
	ldr	x1, [x29, #192]
	# Multiply: x4-x7 = (pb - pa) * q2
	ldp	x21, x22, [x1]
	ldp	x23, x24, [x1, #16]
	#  A[0] * B[0]
	mul	x4, x8, x21
	umulh	x5, x8, x21
	#  A[0] * B[1]
	mul	x25, x8, x22
	umulh	x6, x8, x22
	adds	x5, x5, x25
	adc	x6, x6, xzr
	#  A[1] * B[0]
	mul	x25, x9, x21
	umulh	x26, x9, x21
	adds	x5, x5, x25
	adcs	x6, x6, x26
	adc	x7, xzr, xzr
	#  A[0] * B[2]
	mul	x25, x8, x23
	umulh	x26, x8, x23
	adds	x6, x6, x25
	adc	x7, x7, x26
	#  A[1] * B[1]
	mul	x25, x9, x22
	umulh	x26, x9, x22
	adds	x6, x6, x25
	adcs	x7, x7, x26
	adc	x16, xzr, xzr
	#  A[2] * B[0]
	mul	x25, x10, x21
	umulh	x26, x10, x21
	adds	x6, x6, x25
	adcs	x7, x7, x26
	adc	x16, x16, xzr
	#  A[0] * B[3]
	mul	x25, x8, x24
	umulh	x26, x8, x24
	adds	x7, x7, x25
	adcs	x16, x16, x26
	adc	x17, xzr, xzr
	#  A[1] * B[2]
	mul	x25, x9, x23
	umulh	x26, x9, x23
	adds	x7, x7, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[2] * B[1]
	mul	x25, x10, x22
	umulh	x26, x10, x22
	adds	x7, x7, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[3] * B[0]
	mul	x25, x11, x21
	umulh	x26, x11, x21
	adds	x7, x7, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[1] * B[3]
	mul	x25, x9, x24
	umulh	x26, x9, x24
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, xzr, xzr
	#  A[2] * B[2]
	mul	x25, x10, x23
	umulh	x26, x10, x23
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, x19, xzr
	#  A[3] * B[1]
	mul	x25, x11, x22
	umulh	x26, x11, x22
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, x19, xzr
	#  A[2] * B[3]
	mul	x25, x10, x24
	umulh	x26, x10, x24
	adds	x17, x17, x25
	adcs	x19, x19, x26
	adc	x20, xzr, xzr
	#  A[3] * B[2]
	mul	x25, x11, x23
	umulh	x26, x11, x23
	adds	x17, x17, x25
	adcs	x19, x19, x26
	adc	x20, x20, xzr
	#  A[3] * B[3]
	mul	x25, x11, x24
	umulh	x26, x11, x24
	adds	x19, x19, x25
	adc	x20, x20, x26
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x20, x20, x19, #63
	extr	x19, x19, x17, #63
	extr	x17, x17, x16, #63
	extr	x16, x16, x7, #63
	and	x7, x7, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x25, #19
	mul	x26, x25, x16
	umulh	x16, x25, x16
	adds	x4, x4, x26
	mul	x26, x25, x17
	umulh	x17, x25, x17
	adcs	x5, x5, x26
	mul	x26, x25, x19
	umulh	x19, x25, x19
	adcs	x6, x6, x26
	mul	x26, x25, x20
	umulh	x27, x25, x20
	adcs	x7, x7, x26
	adc	x27, x27, xzr
	#  Add remaining product results in
	adds	x5, x5, x16
	adcs	x6, x6, x17
	adcs	x7, x7, x19
	adc	x27, x27, xzr
	#  Overflow
	extr	x27, x27, x7, #63
	mul	x27, x27, x25
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Reduce if top bit set
	and	x27, x25, x7, asr 63
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Product m2 stays in x4-x7; point x0 at r1 and x1 at r0
	ldr	x0, [x29, #24]
	ldr	x1, [x29, #16]
	# Add: x8-x11 = m1 + m2
	adds	x8, x12, x4
	adcs	x9, x13, x5
	adcs	x10, x14, x6
	adc	x11, x15, x7
	mov	x25, #-19
	asr	x28, x11, #63
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x8, x8, x25
	sbcs	x9, x9, x28
	sbcs	x10, x10, x28
	sbc	x11, x11, x26
	# Sub: x16, x17, x19, x20 = m1 - m2
	subs	x16, x12, x4
	sbcs	x17, x13, x5
	sbcs	x19, x14, x6
	sbcs	x20, x15, x7
	mov	x25, #-19
	csetm	x28, cc
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Add modulus (if underflow)
	adds	x16, x16, x25
	adcs	x17, x17, x28
	adcs	x19, x19, x28
	adc	x20, x20, x26
	# Store r1 (sum) and r0 (difference)
	stp	x8, x9, [x0]
	stp	x10, x11, [x0, #16]
	stp	x16, x17, [x1]
	stp	x19, x20, [x1, #16]
	ldr	x1, [x29, #176]
	ldr	x3, [x29, #72]
	# Multiply: x4-x7 = q0 * pd
	ldp	x16, x17, [x1]
	ldp	x19, x20, [x1, #16]
	ldp	x21, x22, [x3]
	ldp	x23, x24, [x3, #16]
	#  A[0] * B[0]
	mul	x4, x16, x21
	umulh	x5, x16, x21
	#  A[0] * B[1]
	mul	x25, x16, x22
	umulh	x6, x16, x22
	adds	x5, x5, x25
	adc	x6, x6, xzr
	#  A[1] * B[0]
	mul	x25, x17, x21
	umulh	x26, x17, x21
	adds	x5, x5, x25
	adcs	x6, x6, x26
	adc	x7, xzr, xzr
	#  A[0] * B[2]
	mul	x25, x16, x23
	umulh	x26, x16, x23
	adds	x6, x6, x25
	adc	x7, x7, x26
	#  A[1] * B[1]
	mul	x25, x17, x22
	umulh	x26, x17, x22
	adds	x6, x6, x25
	adcs	x7, x7, x26
	adc	x8, xzr, xzr
	#  A[2] * B[0]
	mul	x25, x19, x21
	umulh	x26, x19, x21
	adds	x6, x6, x25
	adcs	x7, x7, x26
	adc	x8, x8, xzr
	#  A[0] * B[3]
	mul	x25, x16, x24
	umulh	x26, x16, x24
	adds	x7, x7, x25
	adcs	x8, x8, x26
	adc	x9, xzr, xzr
	#  A[1] * B[2]
	mul	x25, x17, x23
	umulh	x26, x17, x23
	adds	x7, x7, x25
	adcs	x8, x8, x26
	adc	x9, x9, xzr
	#  A[2] * B[1]
	mul	x25, x19, x22
	umulh	x26, x19, x22
	adds	x7, x7, x25
	adcs	x8, x8, x26
	adc	x9, x9, xzr
	#  A[3] * B[0]
	mul	x25, x20, x21
	umulh	x26, x20, x21
	adds	x7, x7, x25
	adcs	x8, x8, x26
	adc	x9, x9, xzr
	#  A[1] * B[3]
	mul	x25, x17, x24
	umulh	x26, x17, x24
	adds	x8, x8, x25
	adcs	x9, x9, x26
	adc	x10, xzr, xzr
	#  A[2] * B[2]
	mul	x25, x19, x23
	umulh	x26, x19, x23
	adds	x8, x8, x25
	adcs	x9, x9, x26
	adc	x10, x10, xzr
	#  A[3] * B[1]
	mul	x25, x20, x22
	umulh	x26, x20, x22
	adds	x8, x8, x25
	adcs	x9, x9, x26
	adc	x10, x10, xzr
	#  A[2] * B[3]
	mul	x25, x19, x24
	umulh	x26, x19, x24
	adds	x9, x9, x25
	adcs	x10, x10, x26
	adc	x11, xzr, xzr
	#  A[3] * B[2]
	mul	x25, x20, x23
	umulh	x26, x20, x23
	adds	x9, x9, x25
	adcs	x10, x10, x26
	adc	x11, x11, xzr
	#  A[3] * B[3]
	mul	x25, x20, x24
	umulh	x26, x20, x24
	adds	x10, x10, x25
	adc	x11, x11, x26
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x11, x11, x10, #63
	extr	x10, x10, x9, #63
	extr	x9, x9, x8, #63
	extr	x8, x8, x7, #63
	and	x7, x7, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x25, #19
	mul	x26, x25, x8
	umulh	x8, x25, x8
	adds	x4, x4, x26
	mul	x26, x25, x9
	umulh	x9, x25, x9
	adcs	x5, x5, x26
	mul	x26, x25, x10
	umulh	x10, x25, x10
	adcs	x6, x6, x26
	mul	x26, x25, x11
	umulh	x27, x25, x11
	adcs	x7, x7, x26
	adc	x27, x27, xzr
	#  Add remaining product results in
	adds	x5, x5, x8
	adcs	x6, x6, x9
	adcs	x7, x7, x10
	adc	x27, x27, xzr
	#  Overflow
	extr	x27, x27, x7, #63
	mul	x27, x27, x25
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Reduce if top bit set
	and	x27, x25, x7, asr 63
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Product m3 stays in x4-x7; point x0 at r2 and x1 at pc
	ldr	x0, [x29, #32]
	ldr	x1, [x29, #64]
	# Double: x8-x11 = 2 * pc
	ldp	x8, x9, [x1]
	ldp	x10, x11, [x1, #16]
	adds	x8, x8, x8
	adcs	x9, x9, x9
	adcs	x10, x10, x10
	adc	x11, x11, x11
	mov	x25, #-19
	asr	x28, x11, #63
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x8, x8, x25
	sbcs	x9, x9, x28
	sbcs	x10, x10, x28
	sbc	x11, x11, x26
	ldr	x1, [x29, #40]
	# Add: x12-x15 = 2 * pc + m3
	adds	x12, x8, x4
	adcs	x13, x9, x5
	adcs	x14, x10, x6
	adc	x15, x11, x7
	mov	x25, #-19
	asr	x28, x15, #63
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x12, x12, x25
	sbcs	x13, x13, x28
	sbcs	x14, x14, x28
	sbc	x15, x15, x26
	# Sub: x16, x17, x19, x20 = 2 * pc - m3
	subs	x16, x8, x4
	sbcs	x17, x9, x5
	sbcs	x19, x10, x6
	sbcs	x20, x11, x7
	mov	x25, #-19
	csetm	x28, cc
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Add modulus (if underflow)
	adds	x16, x16, x25
	adcs	x17, x17, x28
	adcs	x19, x19, x28
	adc	x20, x20, x26
	# Store r2 (sum) and r3 (difference)
	stp	x12, x13, [x0]
	stp	x14, x15, [x0, #16]
	stp	x16, x17, [x1]
	stp	x19, x20, [x1, #16]
	# Restore saved registers and release the frame
	ldr	x17, [x29, #88]
	ldr	x19, [x29, #96]
	ldp	x20, x21, [x29, #104]
	ldp	x22, x23, [x29, #120]
	ldp	x24, x25, [x29, #136]
	ldp	x26, x27, [x29, #152]
	ldr	x28, [x29, #168]
	ldp	x29, x30, [sp], #0xb0
	ret
	.size	fe_ge_madd,.-fe_ge_madd
	.text
	.align	2
	.globl	fe_ge_msub
	.type	fe_ge_msub, %function
fe_ge_msub:
	stp	x29, x30, [sp, #-176]!
	add	x29, sp, #0
	str	x17, [x29, #88]
	str	x19, [x29, #96]
	stp	x20, x21, [x29, #104]
	stp	x22, x23, [x29, #120]
	stp	x24, x25, [x29, #136]
	stp	x26, x27, [x29, #152]
	str	x28, [x29, #168]
	str	x0, [x29, #16]
	str	x1, [x29, #24]
	str	x2, [x29, #32]
	str	x3, [x29, #40]
	str	x4, [x29, #48]
	str	x5, [x29, #56]
	str	x6, [x29, #64]
	str	x7, [x29, #72]
	ldr	x2, [x29, #56]
	ldr	x3, [x29, #48]
	# Add
	ldp	x12, x13, [x2]
	ldp	x14, x15, [x2, #16]
	ldp	x16, x17, [x3]
	ldp	x19, x20, [x3, #16]
	adds	x4, x12, x16
	adcs	x5, x13, x17
	adcs	x6, x14, x19
	adc	x7, x15, x20
	mov	x25, #-19
	asr	x28, x7, #63
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x4, x4, x25
	sbcs	x5, x5, x28
	sbcs	x6, x6, x28
	sbc	x7, x7, x26
	# Sub
	subs	x8, x12, x16
	sbcs	x9, x13, x17
	sbcs	x10, x14, x19
	sbcs	x11, x15, x20
	mov	x25, #-19
	csetm	x28, cc
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Add modulus (if underflow)
	adds	x8, x8, x25
	adcs	x9, x9, x28
	adcs	x10, x10, x28
	adc	x11, x11, x26
	ldr	x0, [x29, #32]
	ldr	x2, [x29, #192]
	# Multiply
	ldp	x21, x22, [x2]
	ldp	x23, x24, [x2, #16]
	#  A[0] * B[0]
	mul	x12, x4, x21
	umulh	x13, x4, x21
	#  A[0] * B[1]
	mul	x25, x4, x22
	umulh	x14, x4, x22
	adds	x13, x13, x25
	adc	x14, x14, xzr
	#  A[1] * B[0]
	mul	x25, x5, x21
	umulh	x26, x5, x21
	adds	x13, x13, x25
	adcs	x14, x14, x26
	adc	x15, xzr, xzr
	#  A[0] * B[2]
	mul	x25, x4, x23
	umulh	x26, x4, x23
	adds	x14, x14, x25
	adc	x15, x15, x26
	#  A[1] * B[1]
	mul	x25, x5, x22
	umulh	x26, x5, x22
	adds	x14, x14, x25
	adcs	x15, x15, x26
	adc	x16, xzr, xzr
	#  A[2] * B[0]
	mul	x25, x6, x21
	umulh	x26, x6, x21
	adds	x14, x14, x25
	adcs	x15, x15, x26
	adc	x16, x16, xzr
	#  A[0] * B[3]
	mul	x25, x4, x24
	umulh	x26, x4, x24
	adds	x15, x15, x25
	adcs	x16, x16, x26
	adc	x17, xzr, xzr
	#  A[1] * B[2]
	mul	x25, x5, x23
	umulh	x26, x5, x23
	adds	x15, x15, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[2] * B[1]
	mul	x25, x6, x22
	umulh	x26, x6, x22
	adds	x15, x15, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[3] * B[0]
	mul	x25, x7, x21
	umulh	x26, x7, x21
	adds	x15, x15, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[1] * B[3]
	mul	x25, x5, x24
	umulh	x26, x5, x24
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, xzr, xzr
	#  A[2] * B[2]
	mul	x25, x6, x23
	umulh	x26, x6, x23
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, x19, xzr
	#  A[3] * B[1]
	mul	x25, x7, x22
	umulh	x26, x7, x22
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, x19, xzr
	#  A[2] * B[3]
	mul	x25, x6, x24
	umulh	x26, x6, x24
	adds	x17, x17, x25
	adcs	x19, x19, x26
	adc	x20, xzr, xzr
	#  A[3] * B[2]
	mul	x25, x7, x23
	umulh	x26, x7, x23
	adds	x17, x17, x25
	adcs	x19, x19, x26
	adc	x20, x20, xzr
	#  A[3] * B[3]
	mul	x25, x7, x24
	umulh	x26, x7, x24
	adds	x19, x19, x25
	adc	x20, x20, x26
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x20, x20, x19, #63
	extr	x19, x19, x17, #63
	extr	x17, x17, x16, #63
	extr	x16, x16, x15, #63
	and	x15, x15, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x25, #19
	mul	x26, x25, x16
	umulh	x16, x25, x16
	adds	x12, x12, x26
	mul	x26, x25, x17
	umulh	x17, x25, x17
	adcs	x13, x13, x26
	mul	x26, x25, x19
	umulh	x19, x25, x19
	adcs	x14, x14, x26
	mul	x26, x25, x20
	umulh	x27, x25, x20
	adcs	x15, x15, x26
	adc	x27, x27, xzr
	#  Add remaining product results in
	adds	x13, x13, x16
	adcs	x14, x14, x17
	adcs	x15, x15, x19
	adc	x27, x27, xzr
	#  Overflow
	extr	x27, x27, x15, #63
	mul	x27, x27, x25
	and	x15, x15, #0x7fffffffffffffff
	adds	x12, x12, x27
	adcs	x13, x13, xzr
	adcs	x14, x14, xzr
	adc	x15, x15, xzr
	# Reduce if top bit set
	and	x27, x25, x15, asr 63
	and	x15, x15, #0x7fffffffffffffff
	adds	x12, x12, x27
	adcs	x13, x13, xzr
	adcs	x14, x14, xzr
	adc	x15, x15, xzr
	# Store
	ldr	x0, [x29, #24]
	ldr	x1, [x29, #184]
	# Multiply
	ldp	x21, x22, [x1]
	ldp	x23, x24, [x1, #16]
	#  A[0] * B[0]
	mul	x4, x8, x21
	umulh	x5, x8, x21
	#  A[0] * B[1]
	mul	x25, x8, x22
	umulh	x6, x8, x22
	adds	x5, x5, x25
	adc	x6, x6, xzr
	#  A[1] * B[0]
	mul	x25, x9, x21
	umulh	x26, x9, x21
	adds	x5, x5, x25
	adcs	x6, x6, x26
	adc	x7, xzr, xzr
	#  A[0] * B[2]
	mul	x25, x8, x23
	umulh	x26, x8, x23
	adds	x6, x6, x25
	adc	x7, x7, x26
	#  A[1] * B[1]
	mul	x25, x9, x22
	umulh	x26, x9, x22
	adds	x6, x6, x25
	adcs	x7, x7, x26
	adc	x16, xzr, xzr
	#  A[2] * B[0]
	mul	x25, x10, x21
	umulh	x26, x10, x21
	adds	x6, x6, x25
	adcs	x7, x7, x26
	adc	x16, x16, xzr
	#  A[0] * B[3]
	mul	x25, x8, x24
	umulh	x26, x8, x24
	adds	x7, x7, x25
	adcs	x16, x16, x26
	adc	x17, xzr, xzr
	#  A[1] * B[2]
	mul	x25, x9, x23
	umulh	x26, x9, x23
	adds	x7, x7, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[2] * B[1]
	mul	x25, x10, x22
	umulh	x26, x10, x22
	adds	x7, x7, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[3] * B[0]
	mul	x25, x11, x21
	umulh	x26, x11, x21
	adds	x7, x7, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[1] * B[3]
	mul	x25, x9, x24
	umulh	x26, x9, x24
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, xzr, xzr
	#  A[2] * B[2]
	mul	x25, x10, x23
	umulh	x26, x10, x23
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, x19, xzr
	#  A[3] * B[1]
	mul	x25, x11, x22
	umulh	x26, x11, x22
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, x19, xzr
	#  A[2] * B[3]
	mul	x25, x10, x24
	umulh	x26, x10, x24
	adds	x17, x17, x25
	adcs	x19, x19, x26
	adc	x20, xzr, xzr
	#  A[3] * B[2]
	mul	x25, x11, x23
	umulh	x26, x11, x23
	adds	x17, x17, x25
	adcs	x19, x19, x26
	adc	x20, x20, xzr
	#  A[3] * B[3]
	mul	x25, x11, x24
	umulh	x26, x11, x24
	adds	x19, x19, x25
	adc	x20, x20, x26
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x20, x20, x19, #63
	extr	x19, x19, x17, #63
	extr	x17, x17, x16, #63
	extr	x16, x16, x7, #63
	and	x7, x7, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x25, #19
	mul	x26, x25, x16
	umulh	x16, x25, x16
	adds	x4, x4, x26
	mul	x26, x25, x17
	umulh	x17, x25, x17
	adcs	x5, x5, x26
	mul	x26, x25, x19
	umulh	x19, x25, x19
	adcs	x6, x6, x26
	mul	x26, x25, x20
	umulh	x27, x25, x20
	adcs	x7, x7, x26
	adc	x27, x27, xzr
	#  Add remaining product results in
	adds	x5, x5, x16
	adcs	x6, x6, x17
	adcs	x7, x7, x19
	adc	x27, x27, xzr
	#  Overflow
	extr	x27, x27, x7, #63
	mul	x27, x27, x25
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Reduce if top bit set
	and	x27, x25, x7, asr 63
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Store
	ldr	x0, [x29, #24]
	ldr	x1, [x29, #16]
	# Add
	adds	x8, x12, x4
	adcs	x9, x13, x5
	adcs	x10, x14, x6
	adc	x11, x15, x7
	mov	x25, #-19
	asr	x28, x11, #63
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x8, x8, x25
	sbcs	x9, x9, x28
	sbcs	x10, x10, x28
	sbc	x11, x11, x26
	# Sub
	subs	x16, x12, x4
	sbcs	x17, x13, x5
	sbcs	x19, x14, x6
	sbcs	x20, x15, x7
	mov	x25, #-19
	csetm	x28, cc
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Add modulus (if underflow)
	adds	x16, x16, x25
	adcs	x17, x17, x28
	adcs	x19, x19, x28
	adc	x20, x20, x26
	stp	x8, x9, [x0]
	stp	x10, x11, [x0, #16]
	stp	x16, x17, [x1]
	stp	x19, x20, [x1, #16]
	ldr	x0, [x29, #40]
	ldr	x1, [x29, #176]
	ldr	x3, [x29, #72]
	# Multiply
	ldp	x16, x17, [x1]
	ldp	x19, x20, [x1, #16]
	ldp	x21, x22, [x3]
	ldp	x23, x24, [x3, #16]
	#  A[0] * B[0]
	mul	x4, x16, x21
	umulh	x5, x16, x21
	#  A[0] * B[1]
	mul	x25, x16, x22
	umulh	x6, x16, x22
	adds	x5, x5, x25
	adc	x6, x6, xzr
	#  A[1] * B[0]
	mul	x25, x17, x21
	umulh	x26, x17, x21
	adds	x5, x5, x25
	adcs	x6, x6, x26
	adc	x7, xzr, xzr
	#  A[0] * B[2]
	mul	x25, x16, x23
	umulh	x26, x16, x23
	adds	x6, x6, x25
	adc	x7, x7, x26
	#  A[1] * B[1]
	mul	x25, x17, x22
	umulh	x26, x17, x22
	adds	x6, x6, x25
	adcs	x7, x7, x26
	adc	x8, xzr, xzr
	#  A[2] * B[0]
	mul	x25, x19, x21
	umulh	x26, x19, x21
	adds	x6, x6, x25
	adcs	x7, x7, x26
	adc	x8, x8, xzr
	#  A[0] * B[3]
	mul	x25, x16, x24
	umulh	x26, x16, x24
	adds	x7, x7, x25
	adcs	x8, x8, x26
	adc	x9, xzr, xzr
	#  A[1] * B[2]
	mul	x25, x17, x23
	umulh	x26, x17, x23
	adds	x7, x7, x25
	adcs	x8, x8, x26
	adc	x9, x9, xzr
	#  A[2] * B[1]
	mul	x25, x19, x22
	umulh	x26, x19, x22
	adds	x7, x7, x25
	adcs	x8, x8, x26
	adc	x9, x9, xzr
	#  A[3] * B[0]
	mul	x25, x20, x21
	umulh	x26, x20, x21
	adds	x7, x7, x25
	adcs	x8, x8, x26
	adc	x9, x9, xzr
	#  A[1] * B[3]
	mul	x25, x17, x24
	umulh	x26, x17, x24
	adds	x8, x8, x25
	adcs	x9, x9, x26
	adc	x10, xzr, xzr
	#  A[2] * B[2]
	mul	x25, x19, x23
	umulh	x26, x19, x23
	adds	x8, x8, x25
	adcs	x9, x9, x26
	adc	x10, x10, xzr
	#  A[3] * B[1]
	mul	x25, x20, x22
	umulh	x26, x20, x22
	adds	x8, x8, x25
	adcs	x9, x9, x26
	adc	x10, x10, xzr
	#  A[2] * B[3]
	mul	x25, x19, x24
	umulh	x26, x19, x24
	adds	x9, x9, x25
	adcs	x10, x10, x26
	adc	x11, xzr, xzr
	#  A[3] * B[2]
	mul	x25, x20, x23
	umulh	x26, x20, x23
	adds	x9, x9, x25
	adcs	x10, x10, x26
	adc	x11, x11, xzr
	#  A[3] * B[3]
	mul	x25, x20, x24
	umulh	x26, x20, x24
	adds	x10, x10, x25
	adc	x11, x11, x26
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x11, x11, x10, #63
	extr	x10, x10, x9, #63
	extr	x9, x9, x8, #63
	extr	x8, x8, x7, #63
	and	x7, x7, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x25, #19
	mul	x26, x25, x8
	umulh	x8, x25, x8
	adds	x4, x4, x26
	mul	x26, x25, x9
	umulh	x9, x25, x9
	adcs	x5, x5, x26
	mul	x26, x25, x10
	umulh	x10, x25, x10
	adcs	x6, x6, x26
	mul	x26, x25, x11
	umulh	x27, x25, x11
	adcs	x7, x7, x26
	adc	x27, x27, xzr
	#  Add remaining product results in
	adds	x5, x5, x8
	adcs	x6, x6, x9
	adcs	x7, x7, x10
	adc	x27, x27, xzr
	#  Overflow
	extr	x27, x27, x7, #63
	mul	x27, x27, x25
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Reduce if top bit set
	and	x27, x25, x7, asr 63
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Store
	ldr	x0, [x29, #32]
	ldr	x1, [x29, #64]
	# Double
	ldp	x8, x9, [x1]
	ldp	x10, x11, [x1, #16]
	adds	x8, x8, x8
	adcs	x9, x9, x9
	adcs	x10, x10, x10
	adc	x11, x11, x11
	mov	x25, #-19
	asr	x28, x11, #63
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x8, x8, x25
	sbcs	x9, x9, x28
	sbcs	x10, x10, x28
	sbc	x11, x11, x26
	ldr	x1, [x29, #40]
	# Add
	adds	x12, x8, x4
	adcs	x13, x9, x5
	adcs	x14, x10, x6
	adc	x15, x11, x7
	mov	x25, #-19
	asr	x28, x15, #63
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x12, x12, x25
	sbcs	x13, x13, x28
	sbcs	x14, x14, x28
	sbc	x15, x15, x26
	# Sub
	subs	x16, x8, x4
	sbcs	x17, x9, x5
	sbcs	x19, x10, x6
	sbcs	x20, x11, x7
	mov	x25, #-19
	csetm	x28, cc
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Add modulus (if underflow)
	adds	x16, x16, x25
	adcs	x17, x17, x28
	adcs	x19, x19, x28
	adc	x20, x20, x26
	stp	x12, x13, [x1]
	stp	x14, x15, [x1, #16]
	stp	x16, x17, [x0]
	stp	x19, x20, [x0, #16]
	ldr	x17, [x29, #88]
	ldr	x19, [x29, #96]
	ldp	x20, x21, [x29, #104]
	ldp	x22, x23, [x29, #120]
	ldp	x24, x25, [x29, #136]
	ldp	x26, x27, [x29, #152]
	ldr	x28, [x29, #168]
	ldp	x29, x30, [sp], #0xb0
	ret
	.size	fe_ge_msub,.-fe_ge_msub
	.text
	.align	2
	.globl	fe_ge_add
	.type	fe_ge_add, %function
/* fe_ge_add
 *
 * Straight-line leaf routine (no branches or calls before the final ret)
 * built from field add, subtract, multiply and double steps. Every value is
 * four 64-bit limbs, least-significant limb first. The modulus applied by
 * the add/sub steps is 2^255 - 19 (limbs -19, -1, -1, 0x7fffffffffffffff).
 *
 * Arguments (each one is used as a pointer to a 4-limb value):
 *   x0..x7             arg0..arg7, spilled to [x29, #16]..[x29, #72]
 *   [x29, #176]        s0  \
 *   [x29, #184]        s1   | read from just above this 176-byte frame,
 *   [x29, #192]        s2   | i.e. from the caller's stack argument area
 *   [x29, #200]        s3  /
 *   NOTE(review): presumably this matches a C prototype of the form
 *   fe_ge_add(rx, ry, rz, rt, px, py, pz, pt, qz, qt2d, qyplusx, qyminusx)
 *   - confirm against the declaring header.
 *
 * Data flow (all arithmetic modulo 2^255 - 19):
 *   a = arg5 + arg4             b = arg5 - arg4
 *   c = a * s2                  d = b * s3
 *   *arg1 = c + d               *arg0 = c - d
 *   e = 2 * (arg6 * s0)         f = s1 * arg7
 *   *arg2 = e + f               *arg3 = e - f
 * arg6, arg7, s0 and s1 are read after arg0 and arg1 have been written.
 *
 * Frame (176 bytes, addressed through x29 == sp):
 *   #0   x29, x30
 *   #16  arg0..arg7 spill slots (through #72)
 *   #80  not referenced
 *   #88  x17, then x19..x28 (through #168)
 * x17 and x19..x28 are restored on exit; x0..x16 and the flags are
 * clobbered. (x17 is saved although AAPCS64 lists it as a temporary.)
 *
 * The "ldr x0" loads that precede each multiply/double step are overwritten
 * by a later reload before x0 is ever used; only the x0/x1 pairs loaded
 * ahead of the two add/sub-and-store steps are consumed.
 */
fe_ge_add:
	# Prologue: allocate the 176-byte frame, save x29/x30 at its base
	stp	x29, x30, [sp, #-176]!
	add	x29, sp, #0
	# Save x17 and x19..x28 (restored before ret)
	str	x17, [x29, #88]
	str	x19, [x29, #96]
	stp	x20, x21, [x29, #104]
	stp	x22, x23, [x29, #120]
	stp	x24, x25, [x29, #136]
	stp	x26, x27, [x29, #152]
	str	x28, [x29, #168]
	# Spill the eight register arguments so x0..x7 can be reused
	str	x0, [x29, #16]
	str	x1, [x29, #24]
	str	x2, [x29, #32]
	str	x3, [x29, #40]
	str	x4, [x29, #48]
	str	x5, [x29, #56]
	str	x6, [x29, #64]
	str	x7, [x29, #72]
	# x2 = arg5, x3 = arg4
	ldr	x2, [x29, #56]
	ldr	x3, [x29, #48]
	# Add
	#   a = arg5 + arg4 -> x4..x7
	ldp	x12, x13, [x2]
	ldp	x14, x15, [x2, #16]
	ldp	x16, x17, [x3]
	ldp	x19, x20, [x3, #16]
	adds	x4, x12, x16
	adcs	x5, x13, x17
	adcs	x6, x14, x19
	adc	x7, x15, x20
	mov	x25, #-19
	#   x28 = all ones when bit 255 of the sum is set, otherwise zero
	asr	x28, x7, #63
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x4, x4, x25
	sbcs	x5, x5, x28
	sbcs	x6, x6, x28
	sbc	x7, x7, x26
	# Sub
	#   b = arg5 - arg4 -> x8..x11
	subs	x8, x12, x16
	sbcs	x9, x13, x17
	sbcs	x10, x14, x19
	sbcs	x11, x15, x20
	mov	x25, #-19
	#   x28 = all ones when the subtraction borrowed (carry clear)
	csetm	x28, cc
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Add modulus (if underflow)
	adds	x8, x8, x25
	adcs	x9, x9, x28
	adcs	x10, x10, x28
	adc	x11, x11, x26
	# x2 = s2; the x0 load is overwritten before it is used
	ldr	x0, [x29, #32]
	ldr	x2, [x29, #192]
	# Multiply
	#   c = a * s2: a in x4..x7, s2 limbs in x21..x24
	#   Product low limbs in x12..x15, high limbs in x16, x17, x19, x20
	ldp	x21, x22, [x2]
	ldp	x23, x24, [x2, #16]
	#  A[0] * B[0]
	mul	x12, x4, x21
	umulh	x13, x4, x21
	#  A[0] * B[1]
	mul	x25, x4, x22
	umulh	x14, x4, x22
	adds	x13, x13, x25
	adc	x14, x14, xzr
	#  A[1] * B[0]
	mul	x25, x5, x21
	umulh	x26, x5, x21
	adds	x13, x13, x25
	adcs	x14, x14, x26
	adc	x15, xzr, xzr
	#  A[0] * B[2]
	mul	x25, x4, x23
	umulh	x26, x4, x23
	adds	x14, x14, x25
	adc	x15, x15, x26
	#  A[1] * B[1]
	mul	x25, x5, x22
	umulh	x26, x5, x22
	adds	x14, x14, x25
	adcs	x15, x15, x26
	adc	x16, xzr, xzr
	#  A[2] * B[0]
	mul	x25, x6, x21
	umulh	x26, x6, x21
	adds	x14, x14, x25
	adcs	x15, x15, x26
	adc	x16, x16, xzr
	#  A[0] * B[3]
	mul	x25, x4, x24
	umulh	x26, x4, x24
	adds	x15, x15, x25
	adcs	x16, x16, x26
	adc	x17, xzr, xzr
	#  A[1] * B[2]
	mul	x25, x5, x23
	umulh	x26, x5, x23
	adds	x15, x15, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[2] * B[1]
	mul	x25, x6, x22
	umulh	x26, x6, x22
	adds	x15, x15, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[3] * B[0]
	mul	x25, x7, x21
	umulh	x26, x7, x21
	adds	x15, x15, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[1] * B[3]
	mul	x25, x5, x24
	umulh	x26, x5, x24
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, xzr, xzr
	#  A[2] * B[2]
	mul	x25, x6, x23
	umulh	x26, x6, x23
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, x19, xzr
	#  A[3] * B[1]
	mul	x25, x7, x22
	umulh	x26, x7, x22
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, x19, xzr
	#  A[2] * B[3]
	mul	x25, x6, x24
	umulh	x26, x6, x24
	adds	x17, x17, x25
	adcs	x19, x19, x26
	adc	x20, xzr, xzr
	#  A[3] * B[2]
	mul	x25, x7, x23
	umulh	x26, x7, x23
	adds	x17, x17, x25
	adcs	x19, x19, x26
	adc	x20, x20, xzr
	#  A[3] * B[3]
	mul	x25, x7, x24
	umulh	x26, x7, x24
	adds	x19, x19, x25
	adc	x20, x20, x26
	# Reduce
	#  Bits 255 and up are folded back in times 19, because
	#  2^255 = 19 modulo 2^255 - 19
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x20, x20, x19, #63
	extr	x19, x19, x17, #63
	extr	x17, x17, x16, #63
	extr	x16, x16, x15, #63
	and	x15, x15, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x25, #19
	mul	x26, x25, x16
	umulh	x16, x25, x16
	adds	x12, x12, x26
	mul	x26, x25, x17
	umulh	x17, x25, x17
	adcs	x13, x13, x26
	mul	x26, x25, x19
	umulh	x19, x25, x19
	adcs	x14, x14, x26
	mul	x26, x25, x20
	umulh	x27, x25, x20
	adcs	x15, x15, x26
	adc	x27, x27, xzr
	#  Add remaining product results in
	adds	x13, x13, x16
	adcs	x14, x14, x17
	adcs	x15, x15, x19
	adc	x27, x27, xzr
	#  Overflow
	#  x27 = bits 255 and up of the partial result; fold in times 19 again
	extr	x27, x27, x15, #63
	mul	x27, x27, x25
	and	x15, x15, #0x7fffffffffffffff
	adds	x12, x12, x27
	adcs	x13, x13, xzr
	adcs	x14, x14, xzr
	adc	x15, x15, xzr
	# Reduce if top bit set
	#  x27 = 19 when bit 255 is set, otherwise 0 (x25 still holds 19)
	and	x27, x25, x15, asr 63
	and	x15, x15, #0x7fffffffffffffff
	adds	x12, x12, x27
	adcs	x13, x13, xzr
	adcs	x14, x14, xzr
	adc	x15, x15, xzr
	# Nothing is stored here: c stays in x12..x15
	# x1 = s3; the x0 load is overwritten before it is used
	ldr	x0, [x29, #24]
	ldr	x1, [x29, #200]
	# Multiply
	#   d = b * s3: b in x8..x11, s3 limbs in x21..x24
	#   Product low limbs in x4..x7, high limbs in x16, x17, x19, x20
	ldp	x21, x22, [x1]
	ldp	x23, x24, [x1, #16]
	#  A[0] * B[0]
	mul	x4, x8, x21
	umulh	x5, x8, x21
	#  A[0] * B[1]
	mul	x25, x8, x22
	umulh	x6, x8, x22
	adds	x5, x5, x25
	adc	x6, x6, xzr
	#  A[1] * B[0]
	mul	x25, x9, x21
	umulh	x26, x9, x21
	adds	x5, x5, x25
	adcs	x6, x6, x26
	adc	x7, xzr, xzr
	#  A[0] * B[2]
	mul	x25, x8, x23
	umulh	x26, x8, x23
	adds	x6, x6, x25
	adc	x7, x7, x26
	#  A[1] * B[1]
	mul	x25, x9, x22
	umulh	x26, x9, x22
	adds	x6, x6, x25
	adcs	x7, x7, x26
	adc	x16, xzr, xzr
	#  A[2] * B[0]
	mul	x25, x10, x21
	umulh	x26, x10, x21
	adds	x6, x6, x25
	adcs	x7, x7, x26
	adc	x16, x16, xzr
	#  A[0] * B[3]
	mul	x25, x8, x24
	umulh	x26, x8, x24
	adds	x7, x7, x25
	adcs	x16, x16, x26
	adc	x17, xzr, xzr
	#  A[1] * B[2]
	mul	x25, x9, x23
	umulh	x26, x9, x23
	adds	x7, x7, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[2] * B[1]
	mul	x25, x10, x22
	umulh	x26, x10, x22
	adds	x7, x7, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[3] * B[0]
	mul	x25, x11, x21
	umulh	x26, x11, x21
	adds	x7, x7, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[1] * B[3]
	mul	x25, x9, x24
	umulh	x26, x9, x24
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, xzr, xzr
	#  A[2] * B[2]
	mul	x25, x10, x23
	umulh	x26, x10, x23
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, x19, xzr
	#  A[3] * B[1]
	mul	x25, x11, x22
	umulh	x26, x11, x22
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, x19, xzr
	#  A[2] * B[3]
	mul	x25, x10, x24
	umulh	x26, x10, x24
	adds	x17, x17, x25
	adcs	x19, x19, x26
	adc	x20, xzr, xzr
	#  A[3] * B[2]
	mul	x25, x11, x23
	umulh	x26, x11, x23
	adds	x17, x17, x25
	adcs	x19, x19, x26
	adc	x20, x20, xzr
	#  A[3] * B[3]
	mul	x25, x11, x24
	umulh	x26, x11, x24
	adds	x19, x19, x25
	adc	x20, x20, x26
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x20, x20, x19, #63
	extr	x19, x19, x17, #63
	extr	x17, x17, x16, #63
	extr	x16, x16, x7, #63
	and	x7, x7, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x25, #19
	mul	x26, x25, x16
	umulh	x16, x25, x16
	adds	x4, x4, x26
	mul	x26, x25, x17
	umulh	x17, x25, x17
	adcs	x5, x5, x26
	mul	x26, x25, x19
	umulh	x19, x25, x19
	adcs	x6, x6, x26
	mul	x26, x25, x20
	umulh	x27, x25, x20
	adcs	x7, x7, x26
	adc	x27, x27, xzr
	#  Add remaining product results in
	adds	x5, x5, x16
	adcs	x6, x6, x17
	adcs	x7, x7, x19
	adc	x27, x27, xzr
	#  Overflow
	extr	x27, x27, x7, #63
	mul	x27, x27, x25
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Reduce if top bit set
	and	x27, x25, x7, asr 63
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Destination pointers for the stores below: x0 = arg1, x1 = arg0
	ldr	x0, [x29, #24]
	ldr	x1, [x29, #16]
	# Add
	#   c + d -> x8..x11 (stored through x0 = arg1)
	adds	x8, x12, x4
	adcs	x9, x13, x5
	adcs	x10, x14, x6
	adc	x11, x15, x7
	mov	x25, #-19
	asr	x28, x11, #63
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x8, x8, x25
	sbcs	x9, x9, x28
	sbcs	x10, x10, x28
	sbc	x11, x11, x26
	# Sub
	#   c - d -> x16, x17, x19, x20 (stored through x1 = arg0)
	subs	x16, x12, x4
	sbcs	x17, x13, x5
	sbcs	x19, x14, x6
	sbcs	x20, x15, x7
	mov	x25, #-19
	csetm	x28, cc
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Add modulus (if underflow)
	adds	x16, x16, x25
	adcs	x17, x17, x28
	adcs	x19, x19, x28
	adc	x20, x20, x26
	# Store: *arg1 = c + d, *arg0 = c - d
	stp	x8, x9, [x0]
	stp	x10, x11, [x0, #16]
	stp	x16, x17, [x1]
	stp	x19, x20, [x1, #16]
	# x1 = arg6, x2 = s0; the x0 load is overwritten before it is used
	ldr	x0, [x29, #48]
	ldr	x1, [x29, #64]
	ldr	x2, [x29, #176]
	# Multiply
	#   arg6 * s0: arg6 limbs in x12..x15, s0 limbs in x16, x17, x19, x20
	#   Product low limbs in x4..x7, high limbs in x8..x11
	ldp	x12, x13, [x1]
	ldp	x14, x15, [x1, #16]
	ldp	x16, x17, [x2]
	ldp	x19, x20, [x2, #16]
	#  A[0] * B[0]
	mul	x4, x12, x16
	umulh	x5, x12, x16
	#  A[0] * B[1]
	mul	x25, x12, x17
	umulh	x6, x12, x17
	adds	x5, x5, x25
	adc	x6, x6, xzr
	#  A[1] * B[0]
	mul	x25, x13, x16
	umulh	x26, x13, x16
	adds	x5, x5, x25
	adcs	x6, x6, x26
	adc	x7, xzr, xzr
	#  A[0] * B[2]
	mul	x25, x12, x19
	umulh	x26, x12, x19
	adds	x6, x6, x25
	adc	x7, x7, x26
	#  A[1] * B[1]
	mul	x25, x13, x17
	umulh	x26, x13, x17
	adds	x6, x6, x25
	adcs	x7, x7, x26
	adc	x8, xzr, xzr
	#  A[2] * B[0]
	mul	x25, x14, x16
	umulh	x26, x14, x16
	adds	x6, x6, x25
	adcs	x7, x7, x26
	adc	x8, x8, xzr
	#  A[0] * B[3]
	mul	x25, x12, x20
	umulh	x26, x12, x20
	adds	x7, x7, x25
	adcs	x8, x8, x26
	adc	x9, xzr, xzr
	#  A[1] * B[2]
	mul	x25, x13, x19
	umulh	x26, x13, x19
	adds	x7, x7, x25
	adcs	x8, x8, x26
	adc	x9, x9, xzr
	#  A[2] * B[1]
	mul	x25, x14, x17
	umulh	x26, x14, x17
	adds	x7, x7, x25
	adcs	x8, x8, x26
	adc	x9, x9, xzr
	#  A[3] * B[0]
	mul	x25, x15, x16
	umulh	x26, x15, x16
	adds	x7, x7, x25
	adcs	x8, x8, x26
	adc	x9, x9, xzr
	#  A[1] * B[3]
	mul	x25, x13, x20
	umulh	x26, x13, x20
	adds	x8, x8, x25
	adcs	x9, x9, x26
	adc	x10, xzr, xzr
	#  A[2] * B[2]
	mul	x25, x14, x19
	umulh	x26, x14, x19
	adds	x8, x8, x25
	adcs	x9, x9, x26
	adc	x10, x10, xzr
	#  A[3] * B[1]
	mul	x25, x15, x17
	umulh	x26, x15, x17
	adds	x8, x8, x25
	adcs	x9, x9, x26
	adc	x10, x10, xzr
	#  A[2] * B[3]
	mul	x25, x14, x20
	umulh	x26, x14, x20
	adds	x9, x9, x25
	adcs	x10, x10, x26
	adc	x11, xzr, xzr
	#  A[3] * B[2]
	mul	x25, x15, x19
	umulh	x26, x15, x19
	adds	x9, x9, x25
	adcs	x10, x10, x26
	adc	x11, x11, xzr
	#  A[3] * B[3]
	mul	x25, x15, x20
	umulh	x26, x15, x20
	adds	x10, x10, x25
	adc	x11, x11, x26
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x11, x11, x10, #63
	extr	x10, x10, x9, #63
	extr	x9, x9, x8, #63
	extr	x8, x8, x7, #63
	and	x7, x7, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x25, #19
	mul	x26, x25, x8
	umulh	x8, x25, x8
	adds	x4, x4, x26
	mul	x26, x25, x9
	umulh	x9, x25, x9
	adcs	x5, x5, x26
	mul	x26, x25, x10
	umulh	x10, x25, x10
	adcs	x6, x6, x26
	mul	x26, x25, x11
	umulh	x27, x25, x11
	adcs	x7, x7, x26
	adc	x27, x27, xzr
	#  Add remaining product results in
	adds	x5, x5, x8
	adcs	x6, x6, x9
	adcs	x7, x7, x10
	adc	x27, x27, xzr
	#  Overflow
	extr	x27, x27, x7, #63
	mul	x27, x27, x25
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Reduce if top bit set
	and	x27, x25, x7, asr 63
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Nothing is stored here: the product stays in x4..x7
	# The x0 load is overwritten before it is used
	ldr	x0, [x29, #48]
	# Double
	#   e = 2 * (arg6 * s0) -> x4..x7
	adds	x4, x4, x4
	adcs	x5, x5, x5
	adcs	x6, x6, x6
	adc	x7, x7, x7
	mov	x25, #-19
	asr	x28, x7, #63
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x4, x4, x25
	sbcs	x5, x5, x28
	sbcs	x6, x6, x28
	sbc	x7, x7, x26
	# x1 = s1, x2 = arg7; the x0 load is overwritten before it is used
	ldr	x0, [x29, #40]
	ldr	x1, [x29, #184]
	ldr	x2, [x29, #72]
	# Multiply
	#   f = s1 * arg7: s1 limbs in x16, x17, x19, x20, arg7 in x21..x24
	#   Product low limbs in x8..x11, high limbs in x12..x15
	ldp	x16, x17, [x1]
	ldp	x19, x20, [x1, #16]
	ldp	x21, x22, [x2]
	ldp	x23, x24, [x2, #16]
	#  A[0] * B[0]
	mul	x8, x16, x21
	umulh	x9, x16, x21
	#  A[0] * B[1]
	mul	x25, x16, x22
	umulh	x10, x16, x22
	adds	x9, x9, x25
	adc	x10, x10, xzr
	#  A[1] * B[0]
	mul	x25, x17, x21
	umulh	x26, x17, x21
	adds	x9, x9, x25
	adcs	x10, x10, x26
	adc	x11, xzr, xzr
	#  A[0] * B[2]
	mul	x25, x16, x23
	umulh	x26, x16, x23
	adds	x10, x10, x25
	adc	x11, x11, x26
	#  A[1] * B[1]
	mul	x25, x17, x22
	umulh	x26, x17, x22
	adds	x10, x10, x25
	adcs	x11, x11, x26
	adc	x12, xzr, xzr
	#  A[2] * B[0]
	mul	x25, x19, x21
	umulh	x26, x19, x21
	adds	x10, x10, x25
	adcs	x11, x11, x26
	adc	x12, x12, xzr
	#  A[0] * B[3]
	mul	x25, x16, x24
	umulh	x26, x16, x24
	adds	x11, x11, x25
	adcs	x12, x12, x26
	adc	x13, xzr, xzr
	#  A[1] * B[2]
	mul	x25, x17, x23
	umulh	x26, x17, x23
	adds	x11, x11, x25
	adcs	x12, x12, x26
	adc	x13, x13, xzr
	#  A[2] * B[1]
	mul	x25, x19, x22
	umulh	x26, x19, x22
	adds	x11, x11, x25
	adcs	x12, x12, x26
	adc	x13, x13, xzr
	#  A[3] * B[0]
	mul	x25, x20, x21
	umulh	x26, x20, x21
	adds	x11, x11, x25
	adcs	x12, x12, x26
	adc	x13, x13, xzr
	#  A[1] * B[3]
	mul	x25, x17, x24
	umulh	x26, x17, x24
	adds	x12, x12, x25
	adcs	x13, x13, x26
	adc	x14, xzr, xzr
	#  A[2] * B[2]
	mul	x25, x19, x23
	umulh	x26, x19, x23
	adds	x12, x12, x25
	adcs	x13, x13, x26
	adc	x14, x14, xzr
	#  A[3] * B[1]
	mul	x25, x20, x22
	umulh	x26, x20, x22
	adds	x12, x12, x25
	adcs	x13, x13, x26
	adc	x14, x14, xzr
	#  A[2] * B[3]
	mul	x25, x19, x24
	umulh	x26, x19, x24
	adds	x13, x13, x25
	adcs	x14, x14, x26
	adc	x15, xzr, xzr
	#  A[3] * B[2]
	mul	x25, x20, x23
	umulh	x26, x20, x23
	adds	x13, x13, x25
	adcs	x14, x14, x26
	adc	x15, x15, xzr
	#  A[3] * B[3]
	mul	x25, x20, x24
	umulh	x26, x20, x24
	adds	x14, x14, x25
	adc	x15, x15, x26
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x15, x15, x14, #63
	extr	x14, x14, x13, #63
	extr	x13, x13, x12, #63
	extr	x12, x12, x11, #63
	and	x11, x11, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x25, #19
	mul	x26, x25, x12
	umulh	x12, x25, x12
	adds	x8, x8, x26
	mul	x26, x25, x13
	umulh	x13, x25, x13
	adcs	x9, x9, x26
	mul	x26, x25, x14
	umulh	x14, x25, x14
	adcs	x10, x10, x26
	mul	x26, x25, x15
	umulh	x27, x25, x15
	adcs	x11, x11, x26
	adc	x27, x27, xzr
	#  Add remaining product results in
	adds	x9, x9, x12
	adcs	x10, x10, x13
	adcs	x11, x11, x14
	adc	x27, x27, xzr
	#  Overflow
	extr	x27, x27, x11, #63
	mul	x27, x27, x25
	and	x11, x11, #0x7fffffffffffffff
	adds	x8, x8, x27
	adcs	x9, x9, xzr
	adcs	x10, x10, xzr
	adc	x11, x11, xzr
	# Reduce if top bit set
	and	x27, x25, x11, asr 63
	and	x11, x11, #0x7fffffffffffffff
	adds	x8, x8, x27
	adcs	x9, x9, xzr
	adcs	x10, x10, xzr
	adc	x11, x11, xzr
	# Destination pointers for the stores below: x0 = arg2, x1 = arg3
	ldr	x0, [x29, #32]
	ldr	x1, [x29, #40]
	# Add
	#   e + f -> x12..x15 (stored through x0 = arg2)
	adds	x12, x4, x8
	adcs	x13, x5, x9
	adcs	x14, x6, x10
	adc	x15, x7, x11
	mov	x25, #-19
	asr	x28, x15, #63
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x12, x12, x25
	sbcs	x13, x13, x28
	sbcs	x14, x14, x28
	sbc	x15, x15, x26
	# Sub
	#   e - f -> x16, x17, x19, x20 (stored through x1 = arg3)
	subs	x16, x4, x8
	sbcs	x17, x5, x9
	sbcs	x19, x6, x10
	sbcs	x20, x7, x11
	mov	x25, #-19
	csetm	x28, cc
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Add modulus (if underflow)
	adds	x16, x16, x25
	adcs	x17, x17, x28
	adcs	x19, x19, x28
	adc	x20, x20, x26
	# Store: *arg2 = e + f, *arg3 = e - f
	stp	x12, x13, [x0]
	stp	x14, x15, [x0, #16]
	stp	x16, x17, [x1]
	stp	x19, x20, [x1, #16]
	# Epilogue: restore x17 and x19..x28, then release the frame
	# (0xb0 = 176 bytes) while reloading x29/x30
	ldr	x17, [x29, #88]
	ldr	x19, [x29, #96]
	ldp	x20, x21, [x29, #104]
	ldp	x22, x23, [x29, #120]
	ldp	x24, x25, [x29, #136]
	ldp	x26, x27, [x29, #152]
	ldr	x28, [x29, #168]
	ldp	x29, x30, [sp], #0xb0
	ret
	.size	fe_ge_add,.-fe_ge_add
	.text
	.align	2
	.globl	fe_ge_sub
	.type	fe_ge_sub, %function
/* fe_ge_sub
 *
 * Straight-line leaf routine (no branches or calls before the final ret)
 * built from field add, subtract, multiply and double steps. Every value is
 * four 64-bit limbs, least-significant limb first. The modulus applied by
 * the add/sub steps is 2^255 - 19 (limbs -19, -1, -1, 0x7fffffffffffffff).
 *
 * Arguments (each one is used as a pointer to a 4-limb value):
 *   x0..x7             arg0..arg7, spilled to [x29, #16]..[x29, #72]
 *   [x29, #176]        s0  \
 *   [x29, #184]        s1   | read from just above this 176-byte frame,
 *   [x29, #192]        s2   | i.e. from the caller's stack argument area
 *   [x29, #200]        s3  /
 *   NOTE(review): presumably this matches a C prototype of the form
 *   fe_ge_sub(rx, ry, rz, rt, px, py, pz, pt, qz, qt2d, qyplusx, qyminusx)
 *   - confirm against the declaring header.
 *
 * Data flow (all arithmetic modulo 2^255 - 19):
 *   a = arg5 + arg4             b = arg5 - arg4
 *   c = a * s3                  d = b * s2
 *   *arg1 = c + d               *arg0 = c - d
 *   e = 2 * (arg6 * s0)         f = s1 * arg7
 *   *arg3 = e + f               *arg2 = e - f
 * Compared with fe_ge_add, s2 and s3 swap roles in the first two products
 * and the final sum/difference go to the opposite outputs.
 * arg6, arg7, s0 and s1 are read after arg0 and arg1 have been written.
 *
 * Frame (176 bytes, addressed through x29 == sp):
 *   #0   x29, x30
 *   #16  arg0..arg7 spill slots (through #72)
 *   #80  not referenced
 *   #88  x17, then x19..x28 (through #168)
 * x17 and x19..x28 are restored on exit; x0..x16 and the flags are
 * clobbered. (x17 is saved although AAPCS64 lists it as a temporary.)
 *
 * The "ldr x0" loads that precede each multiply/double step are overwritten
 * by a later reload before x0 is ever used; only the x0/x1 pairs loaded
 * ahead of the two add/sub-and-store steps are consumed.
 */
fe_ge_sub:
	# Prologue: allocate the 176-byte frame, save x29/x30 at its base
	stp	x29, x30, [sp, #-176]!
	add	x29, sp, #0
	# Save x17 and x19..x28 (restored before ret)
	str	x17, [x29, #88]
	str	x19, [x29, #96]
	stp	x20, x21, [x29, #104]
	stp	x22, x23, [x29, #120]
	stp	x24, x25, [x29, #136]
	stp	x26, x27, [x29, #152]
	str	x28, [x29, #168]
	# Spill the eight register arguments so x0..x7 can be reused
	str	x0, [x29, #16]
	str	x1, [x29, #24]
	str	x2, [x29, #32]
	str	x3, [x29, #40]
	str	x4, [x29, #48]
	str	x5, [x29, #56]
	str	x6, [x29, #64]
	str	x7, [x29, #72]
	# x2 = arg5, x3 = arg4
	ldr	x2, [x29, #56]
	ldr	x3, [x29, #48]
	# Add
	#   a = arg5 + arg4 -> x4..x7
	ldp	x12, x13, [x2]
	ldp	x14, x15, [x2, #16]
	ldp	x16, x17, [x3]
	ldp	x19, x20, [x3, #16]
	adds	x4, x12, x16
	adcs	x5, x13, x17
	adcs	x6, x14, x19
	adc	x7, x15, x20
	mov	x25, #-19
	#   x28 = all ones when bit 255 of the sum is set, otherwise zero
	asr	x28, x7, #63
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x4, x4, x25
	sbcs	x5, x5, x28
	sbcs	x6, x6, x28
	sbc	x7, x7, x26
	# Sub
	#   b = arg5 - arg4 -> x8..x11
	subs	x8, x12, x16
	sbcs	x9, x13, x17
	sbcs	x10, x14, x19
	sbcs	x11, x15, x20
	mov	x25, #-19
	#   x28 = all ones when the subtraction borrowed (carry clear)
	csetm	x28, cc
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Add modulus (if underflow)
	adds	x8, x8, x25
	adcs	x9, x9, x28
	adcs	x10, x10, x28
	adc	x11, x11, x26
	# x2 = s3; the x0 load is overwritten before it is used
	ldr	x0, [x29, #32]
	ldr	x2, [x29, #200]
	# Multiply
	#   c = a * s3: a in x4..x7, s3 limbs in x21..x24
	#   Product low limbs in x12..x15, high limbs in x16, x17, x19, x20
	ldp	x21, x22, [x2]
	ldp	x23, x24, [x2, #16]
	#  A[0] * B[0]
	mul	x12, x4, x21
	umulh	x13, x4, x21
	#  A[0] * B[1]
	mul	x25, x4, x22
	umulh	x14, x4, x22
	adds	x13, x13, x25
	adc	x14, x14, xzr
	#  A[1] * B[0]
	mul	x25, x5, x21
	umulh	x26, x5, x21
	adds	x13, x13, x25
	adcs	x14, x14, x26
	adc	x15, xzr, xzr
	#  A[0] * B[2]
	mul	x25, x4, x23
	umulh	x26, x4, x23
	adds	x14, x14, x25
	adc	x15, x15, x26
	#  A[1] * B[1]
	mul	x25, x5, x22
	umulh	x26, x5, x22
	adds	x14, x14, x25
	adcs	x15, x15, x26
	adc	x16, xzr, xzr
	#  A[2] * B[0]
	mul	x25, x6, x21
	umulh	x26, x6, x21
	adds	x14, x14, x25
	adcs	x15, x15, x26
	adc	x16, x16, xzr
	#  A[0] * B[3]
	mul	x25, x4, x24
	umulh	x26, x4, x24
	adds	x15, x15, x25
	adcs	x16, x16, x26
	adc	x17, xzr, xzr
	#  A[1] * B[2]
	mul	x25, x5, x23
	umulh	x26, x5, x23
	adds	x15, x15, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[2] * B[1]
	mul	x25, x6, x22
	umulh	x26, x6, x22
	adds	x15, x15, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[3] * B[0]
	mul	x25, x7, x21
	umulh	x26, x7, x21
	adds	x15, x15, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[1] * B[3]
	mul	x25, x5, x24
	umulh	x26, x5, x24
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, xzr, xzr
	#  A[2] * B[2]
	mul	x25, x6, x23
	umulh	x26, x6, x23
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, x19, xzr
	#  A[3] * B[1]
	mul	x25, x7, x22
	umulh	x26, x7, x22
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, x19, xzr
	#  A[2] * B[3]
	mul	x25, x6, x24
	umulh	x26, x6, x24
	adds	x17, x17, x25
	adcs	x19, x19, x26
	adc	x20, xzr, xzr
	#  A[3] * B[2]
	mul	x25, x7, x23
	umulh	x26, x7, x23
	adds	x17, x17, x25
	adcs	x19, x19, x26
	adc	x20, x20, xzr
	#  A[3] * B[3]
	mul	x25, x7, x24
	umulh	x26, x7, x24
	adds	x19, x19, x25
	adc	x20, x20, x26
	# Reduce
	#  Bits 255 and up are folded back in times 19, because
	#  2^255 = 19 modulo 2^255 - 19
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x20, x20, x19, #63
	extr	x19, x19, x17, #63
	extr	x17, x17, x16, #63
	extr	x16, x16, x15, #63
	and	x15, x15, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x25, #19
	mul	x26, x25, x16
	umulh	x16, x25, x16
	adds	x12, x12, x26
	mul	x26, x25, x17
	umulh	x17, x25, x17
	adcs	x13, x13, x26
	mul	x26, x25, x19
	umulh	x19, x25, x19
	adcs	x14, x14, x26
	mul	x26, x25, x20
	umulh	x27, x25, x20
	adcs	x15, x15, x26
	adc	x27, x27, xzr
	#  Add remaining product results in
	adds	x13, x13, x16
	adcs	x14, x14, x17
	adcs	x15, x15, x19
	adc	x27, x27, xzr
	#  Overflow
	#  x27 = bits 255 and up of the partial result; fold in times 19 again
	extr	x27, x27, x15, #63
	mul	x27, x27, x25
	and	x15, x15, #0x7fffffffffffffff
	adds	x12, x12, x27
	adcs	x13, x13, xzr
	adcs	x14, x14, xzr
	adc	x15, x15, xzr
	# Reduce if top bit set
	#  x27 = 19 when bit 255 is set, otherwise 0 (x25 still holds 19)
	and	x27, x25, x15, asr 63
	and	x15, x15, #0x7fffffffffffffff
	adds	x12, x12, x27
	adcs	x13, x13, xzr
	adcs	x14, x14, xzr
	adc	x15, x15, xzr
	# Nothing is stored here: c stays in x12..x15
	# x1 = s2; the x0 load is overwritten before it is used
	ldr	x0, [x29, #24]
	ldr	x1, [x29, #192]
	# Multiply
	#   d = b * s2: b in x8..x11, s2 limbs in x21..x24
	#   Product low limbs in x4..x7, high limbs in x16, x17, x19, x20
	ldp	x21, x22, [x1]
	ldp	x23, x24, [x1, #16]
	#  A[0] * B[0]
	mul	x4, x8, x21
	umulh	x5, x8, x21
	#  A[0] * B[1]
	mul	x25, x8, x22
	umulh	x6, x8, x22
	adds	x5, x5, x25
	adc	x6, x6, xzr
	#  A[1] * B[0]
	mul	x25, x9, x21
	umulh	x26, x9, x21
	adds	x5, x5, x25
	adcs	x6, x6, x26
	adc	x7, xzr, xzr
	#  A[0] * B[2]
	mul	x25, x8, x23
	umulh	x26, x8, x23
	adds	x6, x6, x25
	adc	x7, x7, x26
	#  A[1] * B[1]
	mul	x25, x9, x22
	umulh	x26, x9, x22
	adds	x6, x6, x25
	adcs	x7, x7, x26
	adc	x16, xzr, xzr
	#  A[2] * B[0]
	mul	x25, x10, x21
	umulh	x26, x10, x21
	adds	x6, x6, x25
	adcs	x7, x7, x26
	adc	x16, x16, xzr
	#  A[0] * B[3]
	mul	x25, x8, x24
	umulh	x26, x8, x24
	adds	x7, x7, x25
	adcs	x16, x16, x26
	adc	x17, xzr, xzr
	#  A[1] * B[2]
	mul	x25, x9, x23
	umulh	x26, x9, x23
	adds	x7, x7, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[2] * B[1]
	mul	x25, x10, x22
	umulh	x26, x10, x22
	adds	x7, x7, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[3] * B[0]
	mul	x25, x11, x21
	umulh	x26, x11, x21
	adds	x7, x7, x25
	adcs	x16, x16, x26
	adc	x17, x17, xzr
	#  A[1] * B[3]
	mul	x25, x9, x24
	umulh	x26, x9, x24
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, xzr, xzr
	#  A[2] * B[2]
	mul	x25, x10, x23
	umulh	x26, x10, x23
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, x19, xzr
	#  A[3] * B[1]
	mul	x25, x11, x22
	umulh	x26, x11, x22
	adds	x16, x16, x25
	adcs	x17, x17, x26
	adc	x19, x19, xzr
	#  A[2] * B[3]
	mul	x25, x10, x24
	umulh	x26, x10, x24
	adds	x17, x17, x25
	adcs	x19, x19, x26
	adc	x20, xzr, xzr
	#  A[3] * B[2]
	mul	x25, x11, x23
	umulh	x26, x11, x23
	adds	x17, x17, x25
	adcs	x19, x19, x26
	adc	x20, x20, xzr
	#  A[3] * B[3]
	mul	x25, x11, x24
	umulh	x26, x11, x24
	adds	x19, x19, x25
	adc	x20, x20, x26
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x20, x20, x19, #63
	extr	x19, x19, x17, #63
	extr	x17, x17, x16, #63
	extr	x16, x16, x7, #63
	and	x7, x7, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x25, #19
	mul	x26, x25, x16
	umulh	x16, x25, x16
	adds	x4, x4, x26
	mul	x26, x25, x17
	umulh	x17, x25, x17
	adcs	x5, x5, x26
	mul	x26, x25, x19
	umulh	x19, x25, x19
	adcs	x6, x6, x26
	mul	x26, x25, x20
	umulh	x27, x25, x20
	adcs	x7, x7, x26
	adc	x27, x27, xzr
	#  Add remaining product results in
	adds	x5, x5, x16
	adcs	x6, x6, x17
	adcs	x7, x7, x19
	adc	x27, x27, xzr
	#  Overflow
	extr	x27, x27, x7, #63
	mul	x27, x27, x25
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Reduce if top bit set
	and	x27, x25, x7, asr 63
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Destination pointers for the stores below: x0 = arg1, x1 = arg0
	ldr	x0, [x29, #24]
	ldr	x1, [x29, #16]
	# Add
	#   c + d -> x8..x11 (stored through x0 = arg1)
	adds	x8, x12, x4
	adcs	x9, x13, x5
	adcs	x10, x14, x6
	adc	x11, x15, x7
	mov	x25, #-19
	asr	x28, x11, #63
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x8, x8, x25
	sbcs	x9, x9, x28
	sbcs	x10, x10, x28
	sbc	x11, x11, x26
	# Sub
	#   c - d -> x16, x17, x19, x20 (stored through x1 = arg0)
	subs	x16, x12, x4
	sbcs	x17, x13, x5
	sbcs	x19, x14, x6
	sbcs	x20, x15, x7
	mov	x25, #-19
	csetm	x28, cc
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Add modulus (if underflow)
	adds	x16, x16, x25
	adcs	x17, x17, x28
	adcs	x19, x19, x28
	adc	x20, x20, x26
	# Store: *arg1 = c + d, *arg0 = c - d
	stp	x8, x9, [x0]
	stp	x10, x11, [x0, #16]
	stp	x16, x17, [x1]
	stp	x19, x20, [x1, #16]
	# x1 = arg6, x2 = s0; the x0 load is overwritten before it is used
	ldr	x0, [x29, #48]
	ldr	x1, [x29, #64]
	ldr	x2, [x29, #176]
	# Multiply
	#   arg6 * s0: arg6 limbs in x12..x15, s0 limbs in x16, x17, x19, x20
	#   Product low limbs in x4..x7, high limbs in x8..x11
	ldp	x12, x13, [x1]
	ldp	x14, x15, [x1, #16]
	ldp	x16, x17, [x2]
	ldp	x19, x20, [x2, #16]
	#  A[0] * B[0]
	mul	x4, x12, x16
	umulh	x5, x12, x16
	#  A[0] * B[1]
	mul	x25, x12, x17
	umulh	x6, x12, x17
	adds	x5, x5, x25
	adc	x6, x6, xzr
	#  A[1] * B[0]
	mul	x25, x13, x16
	umulh	x26, x13, x16
	adds	x5, x5, x25
	adcs	x6, x6, x26
	adc	x7, xzr, xzr
	#  A[0] * B[2]
	mul	x25, x12, x19
	umulh	x26, x12, x19
	adds	x6, x6, x25
	adc	x7, x7, x26
	#  A[1] * B[1]
	mul	x25, x13, x17
	umulh	x26, x13, x17
	adds	x6, x6, x25
	adcs	x7, x7, x26
	adc	x8, xzr, xzr
	#  A[2] * B[0]
	mul	x25, x14, x16
	umulh	x26, x14, x16
	adds	x6, x6, x25
	adcs	x7, x7, x26
	adc	x8, x8, xzr
	#  A[0] * B[3]
	mul	x25, x12, x20
	umulh	x26, x12, x20
	adds	x7, x7, x25
	adcs	x8, x8, x26
	adc	x9, xzr, xzr
	#  A[1] * B[2]
	mul	x25, x13, x19
	umulh	x26, x13, x19
	adds	x7, x7, x25
	adcs	x8, x8, x26
	adc	x9, x9, xzr
	#  A[2] * B[1]
	mul	x25, x14, x17
	umulh	x26, x14, x17
	adds	x7, x7, x25
	adcs	x8, x8, x26
	adc	x9, x9, xzr
	#  A[3] * B[0]
	mul	x25, x15, x16
	umulh	x26, x15, x16
	adds	x7, x7, x25
	adcs	x8, x8, x26
	adc	x9, x9, xzr
	#  A[1] * B[3]
	mul	x25, x13, x20
	umulh	x26, x13, x20
	adds	x8, x8, x25
	adcs	x9, x9, x26
	adc	x10, xzr, xzr
	#  A[2] * B[2]
	mul	x25, x14, x19
	umulh	x26, x14, x19
	adds	x8, x8, x25
	adcs	x9, x9, x26
	adc	x10, x10, xzr
	#  A[3] * B[1]
	mul	x25, x15, x17
	umulh	x26, x15, x17
	adds	x8, x8, x25
	adcs	x9, x9, x26
	adc	x10, x10, xzr
	#  A[2] * B[3]
	mul	x25, x14, x20
	umulh	x26, x14, x20
	adds	x9, x9, x25
	adcs	x10, x10, x26
	adc	x11, xzr, xzr
	#  A[3] * B[2]
	mul	x25, x15, x19
	umulh	x26, x15, x19
	adds	x9, x9, x25
	adcs	x10, x10, x26
	adc	x11, x11, xzr
	#  A[3] * B[3]
	mul	x25, x15, x20
	umulh	x26, x15, x20
	adds	x10, x10, x25
	adc	x11, x11, x26
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x11, x11, x10, #63
	extr	x10, x10, x9, #63
	extr	x9, x9, x8, #63
	extr	x8, x8, x7, #63
	and	x7, x7, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x25, #19
	mul	x26, x25, x8
	umulh	x8, x25, x8
	adds	x4, x4, x26
	mul	x26, x25, x9
	umulh	x9, x25, x9
	adcs	x5, x5, x26
	mul	x26, x25, x10
	umulh	x10, x25, x10
	adcs	x6, x6, x26
	mul	x26, x25, x11
	umulh	x27, x25, x11
	adcs	x7, x7, x26
	adc	x27, x27, xzr
	#  Add remaining product results in
	adds	x5, x5, x8
	adcs	x6, x6, x9
	adcs	x7, x7, x10
	adc	x27, x27, xzr
	#  Overflow
	extr	x27, x27, x7, #63
	mul	x27, x27, x25
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Reduce if top bit set
	and	x27, x25, x7, asr 63
	and	x7, x7, #0x7fffffffffffffff
	adds	x4, x4, x27
	adcs	x5, x5, xzr
	adcs	x6, x6, xzr
	adc	x7, x7, xzr
	# Nothing is stored here: the product stays in x4..x7
	# The x0 load is overwritten before it is used
	ldr	x0, [x29, #48]
	# Double
	#   e = 2 * (arg6 * s0) -> x4..x7
	adds	x4, x4, x4
	adcs	x5, x5, x5
	adcs	x6, x6, x6
	adc	x7, x7, x7
	mov	x25, #-19
	asr	x28, x7, #63
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x4, x4, x25
	sbcs	x5, x5, x28
	sbcs	x6, x6, x28
	sbc	x7, x7, x26
	# x1 = s1, x2 = arg7; the x0 load is overwritten before it is used
	ldr	x0, [x29, #40]
	ldr	x1, [x29, #184]
	ldr	x2, [x29, #72]
	# Multiply
	#   f = s1 * arg7: s1 limbs in x16, x17, x19, x20, arg7 in x21..x24
	#   Product low limbs in x8..x11, high limbs in x12..x15
	ldp	x16, x17, [x1]
	ldp	x19, x20, [x1, #16]
	ldp	x21, x22, [x2]
	ldp	x23, x24, [x2, #16]
	#  A[0] * B[0]
	mul	x8, x16, x21
	umulh	x9, x16, x21
	#  A[0] * B[1]
	mul	x25, x16, x22
	umulh	x10, x16, x22
	adds	x9, x9, x25
	adc	x10, x10, xzr
	#  A[1] * B[0]
	mul	x25, x17, x21
	umulh	x26, x17, x21
	adds	x9, x9, x25
	adcs	x10, x10, x26
	adc	x11, xzr, xzr
	#  A[0] * B[2]
	mul	x25, x16, x23
	umulh	x26, x16, x23
	adds	x10, x10, x25
	adc	x11, x11, x26
	#  A[1] * B[1]
	mul	x25, x17, x22
	umulh	x26, x17, x22
	adds	x10, x10, x25
	adcs	x11, x11, x26
	adc	x12, xzr, xzr
	#  A[2] * B[0]
	mul	x25, x19, x21
	umulh	x26, x19, x21
	adds	x10, x10, x25
	adcs	x11, x11, x26
	adc	x12, x12, xzr
	#  A[0] * B[3]
	mul	x25, x16, x24
	umulh	x26, x16, x24
	adds	x11, x11, x25
	adcs	x12, x12, x26
	adc	x13, xzr, xzr
	#  A[1] * B[2]
	mul	x25, x17, x23
	umulh	x26, x17, x23
	adds	x11, x11, x25
	adcs	x12, x12, x26
	adc	x13, x13, xzr
	#  A[2] * B[1]
	mul	x25, x19, x22
	umulh	x26, x19, x22
	adds	x11, x11, x25
	adcs	x12, x12, x26
	adc	x13, x13, xzr
	#  A[3] * B[0]
	mul	x25, x20, x21
	umulh	x26, x20, x21
	adds	x11, x11, x25
	adcs	x12, x12, x26
	adc	x13, x13, xzr
	#  A[1] * B[3]
	mul	x25, x17, x24
	umulh	x26, x17, x24
	adds	x12, x12, x25
	adcs	x13, x13, x26
	adc	x14, xzr, xzr
	#  A[2] * B[2]
	mul	x25, x19, x23
	umulh	x26, x19, x23
	adds	x12, x12, x25
	adcs	x13, x13, x26
	adc	x14, x14, xzr
	#  A[3] * B[1]
	mul	x25, x20, x22
	umulh	x26, x20, x22
	adds	x12, x12, x25
	adcs	x13, x13, x26
	adc	x14, x14, xzr
	#  A[2] * B[3]
	mul	x25, x19, x24
	umulh	x26, x19, x24
	adds	x13, x13, x25
	adcs	x14, x14, x26
	adc	x15, xzr, xzr
	#  A[3] * B[2]
	mul	x25, x20, x23
	umulh	x26, x20, x23
	adds	x13, x13, x25
	adcs	x14, x14, x26
	adc	x15, x15, xzr
	#  A[3] * B[3]
	mul	x25, x20, x24
	umulh	x26, x20, x24
	adds	x14, x14, x25
	adc	x15, x15, x26
	# Reduce
	#  Move top half into t4-t7 and remove top bit from t3
	extr	x15, x15, x14, #63
	extr	x14, x14, x13, #63
	extr	x13, x13, x12, #63
	extr	x12, x12, x11, #63
	and	x11, x11, #0x7fffffffffffffff
	#  Multiply top half by 19
	mov	x25, #19
	mul	x26, x25, x12
	umulh	x12, x25, x12
	adds	x8, x8, x26
	mul	x26, x25, x13
	umulh	x13, x25, x13
	adcs	x9, x9, x26
	mul	x26, x25, x14
	umulh	x14, x25, x14
	adcs	x10, x10, x26
	mul	x26, x25, x15
	umulh	x27, x25, x15
	adcs	x11, x11, x26
	adc	x27, x27, xzr
	#  Add remaining product results in
	adds	x9, x9, x12
	adcs	x10, x10, x13
	adcs	x11, x11, x14
	adc	x27, x27, xzr
	#  Overflow
	extr	x27, x27, x11, #63
	mul	x27, x27, x25
	and	x11, x11, #0x7fffffffffffffff
	adds	x8, x8, x27
	adcs	x9, x9, xzr
	adcs	x10, x10, xzr
	adc	x11, x11, xzr
	# Reduce if top bit set
	and	x27, x25, x11, asr 63
	and	x11, x11, #0x7fffffffffffffff
	adds	x8, x8, x27
	adcs	x9, x9, xzr
	adcs	x10, x10, xzr
	adc	x11, x11, xzr
	# Destination pointers for the stores below: x0 = arg3, x1 = arg2
	ldr	x0, [x29, #40]
	ldr	x1, [x29, #32]
	# Add
	#   e + f -> x12..x15 (stored through x0 = arg3)
	adds	x12, x4, x8
	adcs	x13, x5, x9
	adcs	x14, x6, x10
	adc	x15, x7, x11
	mov	x25, #-19
	asr	x28, x15, #63
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Sub modulus (if overflow)
	subs	x12, x12, x25
	sbcs	x13, x13, x28
	sbcs	x14, x14, x28
	sbc	x15, x15, x26
	# Sub
	#   e - f -> x16, x17, x19, x20 (stored through x1 = arg2)
	subs	x16, x4, x8
	sbcs	x17, x5, x9
	sbcs	x19, x6, x10
	sbcs	x20, x7, x11
	mov	x25, #-19
	csetm	x28, cc
	#   Mask the modulus
	and	x25, x28, x25
	and	x26, x28, #0x7fffffffffffffff
	#   Add modulus (if underflow)
	adds	x16, x16, x25
	adcs	x17, x17, x28
	adcs	x19, x19, x28
	adc	x20, x20, x26
	# Store: *arg3 = e + f, *arg2 = e - f
	stp	x12, x13, [x0]
	stp	x14, x15, [x0, #16]
	stp	x16, x17, [x1]
	stp	x19, x20, [x1, #16]
	# Epilogue: restore x17 and x19..x28, then release the frame
	# (0xb0 = 176 bytes) while reloading x29/x30
	ldr	x17, [x29, #88]
	ldr	x19, [x29, #96]
	ldp	x20, x21, [x29, #104]
	ldp	x22, x23, [x29, #120]
	ldp	x24, x25, [x29, #136]
	ldp	x26, x27, [x29, #152]
	ldr	x28, [x29, #168]
	ldp	x29, x30, [sp], #0xb0
	ret
	.size	fe_ge_sub,.-fe_ge_sub
#endif /* __aarch64__ */
#endif /* WOLFSSL_ARMASM */

#if defined(__linux__) && defined(__ELF__)
.section	.note.GNU-stack,"",%progbits
#endif
